source: main/trunk/greenstone2/perllib/unicode.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.4 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{
47 my ($str) = @_;
48
49 return if (!defined $str);
50 return "" if ($str eq "");
51
52 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54 my @each_char
55 = map { ($_ > 255 )
56 ? # if wide character...
57 sprintf("\\x{%04X}", $_)
58 : # \x{...}
59 (chr($_) =~ m/[[:cntrl:]]/ )
60 ? # else if control character ...
61 sprintf("\\x%02X", $_)
62 : # \x..
63 quotemeta(chr($_)) # else quoted or as themselves
64 } @unpacked_chars;
65
66 return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71 if (my $char = shift) {
72 return sprintf '%2.2x', unpack('U0U*', $char);
73 }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82 my ($in) = @_;
83 my $out = [];
84
85 my $i = 0;
86 my $len = length($in);
87 while ($i < $len) {
88 push (@$out, ord(substr ($in, $i, 1)));
89 $i++;
90 }
91
92 return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100 my ($in) = @_;
101 my $out = "";
102
103 if (!defined($in)|| !defined($$in)) {
104 return $out;
105 }
106
107 my ($c);
108 my $i = 0;
109 my $len = length($$in);
110 while ($i < $len) {
111 $c = ord (substr ($$in, $i, 1));
112 if ($c < 0x80) {
113 # ascii character
114 $out .= chr ($c);
115
116 } else {
117 # extended ascii character
118 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119 $out .= chr (0x80 + ($c & 0x3f));
120 }
121 $i++;
122 }
123
124 return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130 my ($in) = @_;
131 my $out = "";
132
133 foreach my $num (@$in) {
134 next unless defined $num;
135 if ($num < 0x80) {
136 $out .= chr ($num);
137
138 } elsif ($num < 0x800) {
139 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140 $out .= chr (0x80 + ($num & 0x3f));
141
142 } elsif ($num < 0xFFFF) {
143 $out .= chr (0xe0 + (($num >> 12) & 0xf));
144 $out .= chr (0x80 + (($num >> 6) & 0x3f));
145 $out .= chr (0x80 + ($num & 0x3f));
146
147 } else {
148 # error, don't encode anything
149 die;
150 }
151 }
152 return $out;
153}
154
155# utf82unicode takes a utf-8 string and produces a unicode
156# array
157sub utf82unicode {
158 my ($in) = @_;
159 my $out = [];
160
161 my $i = 0;
162 my ($c1, $c2, $c3);
163 my $len = length($in);
164 while ($i < $len) {
165 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
166 # normal ascii character
167 push (@$out, $c1);
168
169 } elsif ($c1 < 0xc0) {
170 # error, was expecting the first byte of an
171 # encoded character. Do nothing.
172
173 } elsif ($c1 < 0xe0 && $i+1 < $len) {
174 # an encoded character with two bytes
175 $c2 = ord (substr ($in, $i+1, 1));
176 if ($c2 >= 0x80 && $c2 < 0xc0) {
177 # everything looks ok
178 push (@$out, ((($c1 & 0x1f) << 6) +
179 ($c2 & 0x3f)));
180 $i++; # gobbled an extra byte
181 }
182
183 } elsif ($c1 < 0xf0 && $i+2 < $len) {
184 # an encoded character with three bytes
185 $c2 = ord (substr ($in, $i+1, 1));
186 $c3 = ord (substr ($in, $i+2, 1));
187 if ($c2 >= 0x80 && $c2 < 0xc0 &&
188 $c3 >= 0x80 && $c3 < 0xc0) {
189 # everything looks ok
190 push (@$out, ((($c1 & 0xf) << 12) +
191 (($c2 & 0x3f) << 6) +
192 ($c3 & 0x3f)));
193
194 $i += 2; # gobbled an extra two bytes
195 }
196
197 } else {
198 # error, only decode Unicode characters not full UCS.
199 # Do nothing.
200 }
201
202 $i++;
203 }
204
205 return $out;
206}
207
208# unicode2ucs2 takes a unicode array and produces a UCS-2
209# unicode string (every two bytes forms a unicode character)
210sub unicode2ucs2 {
211 my ($in) = @_;
212 my $out = "";
213
214 foreach my $num (@$in) {
215 $out .= chr (($num & 0xff00) >> 8);
216 $out .= chr ($num & 0xff);
217 }
218
219 return $out;
220}
221
222# ucs22unicode takes a UCS-2 string and produces a unicode array
223sub ucs22unicode {
224 my ($in) = @_;
225 my $out = [];
226
227 my $i = 0;
228 my $len = length ($in);
229 while ($i+1 < $len) {
230 push (@$out, ord (substr($in, $i, 1)) << 8 +
231 ord (substr($in, $i+1, 1)));
232
233 $i ++;
234 }
235
236 return $out;
237}
238
239# takes a reference to a string and returns a reference to a unicode array
240sub convert2unicode {
241 my ($encoding, $textref) = @_;
242
243 if (!defined $encodings::encodings->{$encoding}) {
244 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
245 return [];
246 }
247
248 my $encodename = "$encoding-unicode";
249 my $enc_info = $encodings::encodings->{$encoding};
250 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
251 "to_uc", $enc_info->{'mapfile'});
252 if (!&loadmapencoding ($encodename, $mapfile)) {
253 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
254 return [];
255 }
256
257 if (defined $enc_info->{'converter'}) {
258 my $converter = $enc_info->{'converter'};
259 return &$converter ($encodename, $textref);
260 }
261
262 if ($unicode::translations{$encodename}->{'count'} == 1) {
263 return &singlebyte2unicode ($encodename, $textref);
264 } else {
265 return &doublebyte2unicode ($encodename, $textref);
266 }
267}
268
269# singlebyte2unicode converts simple 8 bit encodings where characters below
270# 0x80 are normal ascii characters and the rest are decoded using the
271# appropriate mapping files.
272#
273# Examples of encodings that may be converted using singlebyte2unicode are
274# the iso-8859 and windows-125* series.
275sub singlebyte2unicode {
276 my ($encodename, $textref) = @_;
277
278 my @outtext = ();
279 my $len = length($$textref);
280 my ($c);
281 my $i = 0;
282
283 while ($i < $len) {
284 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
285 # normal ascii character
286 push (@outtext, $c);
287 } else {
288 $c = &transchar ($encodename, $c);
289 # put a black square if cannot translate
290 $c = 0x25A1 if $c == 0;
291 push (@outtext, $c);
292 }
293 $i ++;
294 }
295 return \@outtext;
296}
297
298# doublebyte2unicode converts simple two byte encodings where characters
299# below code point 0x80 are single-byte characters and the rest are
300# double-byte characters.
301#
302# Examples of encodings that may be converted using doublebyte2unicode are
303# CJK encodings like GB encoded Chinese and UHC Korean.
304#
305# Note that no error checking is performed to make sure that the input text
306# is valid for the given encoding.
307#
308# Also, encodings that may contain characters of more than two bytes are
309# not supported (any EUC encoded text may in theory contain 3-byte
310# characters but in practice only one and two byte characters are used).
311sub doublebyte2unicode {
312 my ($encodename, $textref) = @_;
313
314 my @outtext = ();
315 my $len = length($$textref);
316 my ($c1, $c2);
317 my $i = 0;
318
319 while ($i < $len) {
320 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
321 if ($i+1 < $len) {
322 # double-byte character
323 $c2 = ord(substr($$textref, $i+1, 1));
324 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
325 # put a black square if cannot translate
326 $c = 0x25A1 if $c == 0;
327 push (@outtext, $c);
328 $i += 2;
329
330 } else {
331 # error
332 print STDERR "unicode: ERROR missing second half of double-byte character\n";
333 $i++;
334 }
335
336 } else {
337 # single-byte character
338 push (@outtext, $c1);
339 $i++;
340 }
341 }
342 return \@outtext;
343}
344
345# Shift-JIS to unicode
346# We can't use doublebyte2unicode for Shift-JIS because it uses some
347# single-byte characters above code point 0x80 (i.e. half-width katakana
348# characters in the range 0xA1-0xDF)
349sub shiftjis2unicode {
350 my ($encodename, $textref) = @_;
351
352 my @outtext = ();
353 my $len = length($$textref);
354 my ($c1, $c2);
355 my $i = 0;
356
357 while ($i < $len) {
358 $c1 = ord(substr($$textref, $i, 1));
359
360 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
361 # Single-byte half-width katakana character or
362 # JIS Roman yen or overline characters
363 my $c = &transchar ($encodename, $c1);
364 # - put a black square if cannot translate
365 $c = 0x25A1 if $c == 0;
366 push (@outtext, $c);
367 $i++;
368
369 } elsif ($c1 < 0x80) {
370 # ASCII
371 push (@outtext, $c1);
372 $i ++;
373
374 } elsif ($c1 < 0xEF) {
375 if ($i+1 < $len) {
376 $c2 = ord(substr($$textref, $i+1, 1));
377 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
378 # Double-byte shift-jis character
379 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
380 # put a black square if cannot translate
381 $c = 0x25A1 if $c == 0;
382 push (@outtext, $c);
383 } else {
384 # error
385 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
386 }
387 $i += 2;
388 } else {
389 # error
390 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
391 $i ++;
392 }
393 } else {
394 # error
395 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
396 $i ++;
397 }
398 }
399 return \@outtext;
400}
401
402sub transchar {
403 my ($encoding, $from) = @_;
404 my $high = ($from / 256) % 256;
405 my $low = $from % 256;
406
407 return 0 unless defined $unicode::translations{$encoding};
408
409 my $block = $unicode::translations{$encoding}->{'map'};
410
411 if (ref ($block->[$high]) ne "ARRAY") {
412 return 0;
413 }
414 return $block->[$high]->[$low];
415}
416
417# %translations is of the form:
418#
419# encodings{encodingname-encodingname}->{'map'}->blocktranslation
420# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
421#
422# Any of the top translation blocks can point to an undefined
423# value. This data structure aims to allow fast translation and
424# efficient storage.
425%unicode::translations = ();
426
427# @array256 is used for initialisation, there must be
428# a better way...
429# What about this?: @array256 = (0) x 256;
430@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
446
447# returns 1 if successful, 0 if unsuccessful
448sub loadmapencoding {
449 my ($encoding, $mapfile) = @_;
450
451 # check to see if the encoding has already been loaded
452 return 1 if (defined $unicode::translations{$encoding});
453
454 if (! -r $mapfile || -d $mapfile) {
455 return 0;
456 }
457 return 0 unless open (MAPFILE, $mapfile);
458 binmode (MAPFILE);
459
460 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
461 my $block = $unicode::translations{$encoding};
462
463 my ($in,$i,$j);
464 while (1) {
465 my $ret=read(MAPFILE, $in, 1);
466 if (!defined($ret)) { # error
467 print STDERR "unicode.pm: error reading mapfile: $!\n";
468 last;
469 }
470 if ($ret != 1) { last }
471 $i = unpack ("C", $in);
472 $block->{'map'}->[$i] = [@unicode::array256];
473 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
474 my ($n1, $n2) = unpack ("CC", $in);
475 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
476 }
477 $block->{'count'} ++;
478 }
479
480 close (MAPFILE);
481}
482
483# unicode2singlebyte converts unicode to simple 8 bit encodings where
484# characters below 0x80 are normal ascii characters and the rest are encoded
485# using the appropriate mapping files.
486#
487# Examples of encodings that may be converted using unicode2singlebyte are
488# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
489sub unicode2singlebyte {
490 my ($uniref, $encoding) = @_;
491
492 my $outtext = "";
493 my $encodename = "unicode-$encoding";
494
495 if (!exists $encodings::encodings->{$encoding}) {
496 print STDERR "unicode.pm: ERROR - unsupported encoding "
497 . "'$encoding' requested\n";
498 return "";
499 }
500
501 my $enc_info = $encodings::encodings->{$encoding};
502 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
503 "from_uc", $enc_info->{'mapfile'});
504 if (!&loadmapencoding ($encodename, $mapfile)) {
505 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
506 return "";
507 }
508
509 foreach my $c (@$uniref) {
510 if ($c < 0x80) {
511 # normal ascii character
512 $outtext .= chr($c);
513 } else {
514 # extended ascii character
515 $c = &transchar ($encodename, $c);
516
517 # put a question mark if cannot translate
518 if ($c == 0) {
519 $outtext .= "?";
520 } else {
521 $outtext .= chr($c);
522 }
523 }
524 }
525 return $outtext;
526}
527
528
529# this makes sure that the referenced input string is utf8 encoded, and
530# will change/remove bytes that aren't.
531# returns 0 if the text was already utf8, or 1 if text modified to become utf8
532sub ensure_utf8 {
533 my $stringref=shift;
534
535 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
536 return $stringref;
537 }
538
539 my $value=$$stringref;
540
541 my $non_utf8_found = 0;
542 $value =~ m/^/g; # to set \G
543 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
544 my $highbytes=$1;
545 my $highbyteslength=length($highbytes);
546 # make sure this block of high bytes is utf-8
547 $highbytes =~ /^/g; # set pos()
548 my $byte_replaced = 0;
549 while ($highbytes =~
550 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
551 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
552 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
553 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
554 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
555 )*([\x80-\xff])? !xg
556 ) {
557 # this highbyte is "out-of-place" for valid utf-8
558 my $badbyte=$1;
559 if (!defined $badbyte) {next} # hit end of string
560 my $pos=pos($highbytes);
561 # replace bad byte. assume iso-8859-1 -> utf-8
562 # ascii2utf8 does "extended ascii"... ie iso-8859-1
563 my $replacement=&unicode::ascii2utf8(\$badbyte);
564 substr($highbytes, $pos-1, 1, $replacement);
565 # update the position to continue searching (for \G)
566 pos($highbytes) = $pos+length($replacement)-1;
567 $byte_replaced = 1;
568 }
569 if ($byte_replaced) {
570 # replace this block of high bytes in the $value
571 $non_utf8_found = 1;
572 my $replength=length($highbytes); # we've changed the length
573 my $textpos=pos($value); # pos at end of last match
574 # replace bad bytes with good bytes
575 substr($value, $textpos-$highbyteslength,
576 $highbyteslength, $highbytes);
577 # update the position to continue searching (for \G)
578 pos($value)=$textpos+($replength-$highbyteslength)+1;
579 }
580 }
581
582 $$stringref = $value;
583 return $non_utf8_found;
584}
585
586# Returns true (1) if the given string is utf8 and false (0) if it isn't.
587# Does not modify the string parameter.
588sub check_is_utf8 {
589 my $value=shift;
590
591 if (!defined($value)) {
592 return 0; # not utf8 because it is undefined
593 }
594
595 $value =~ m/^/g; # to set \G
596 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
597 my $highbytes=$1;
598 # make sure this block of high bytes is utf-8
599 $highbytes =~ /^/g; # set pos()
600 while ($highbytes =~
601 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
602 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
603 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
604 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
605 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
606 )*([\x80-\xff])? !xg
607 ) {
608 my $badbyte=$1;
609 if (defined $badbyte) { # not end of string
610 return 0; # non-utf8 found
611 }
612 }
613 }
614
615 return 1;
616}
617
618sub url_encode {
619 my ($text) = @_;
620
621 if (!&is_url_encoded($text)) {
622 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
623 # return the url-encoded character entity for underscore back to the entity
624 $text =~ s/%26%23095%3B/&\#095;/g;
625 }
626 return $text;
627}
628
629sub url_decode {
630 my ($text,$and_numeric_entities) = @_;
631
632 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
633
634 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
635 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
636 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
637 }
638
639 return $text;
640}
641
642sub url_decode_utf8 {
643 my ($text,$and_numeric_entities) = @_;
644
645 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
646
647 $text = Encode::decode("utf8",$text);
648
649 return $text;
650}
651
652sub is_url_encoded {
653 my ($text) = @_;
654 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
655}
656
657# When a filename on the filesystem is already URL-encoded, the
658# URL to it will have %25s in place of every % sign, so that
659# URLs in html pages can refer to the URL-encoded filename.
660# This method changes the URL reference back into the actual
661# (URL-encoded) filename on the filesystem by replacing %25 with %.
662sub url_to_filename {
663 my ($text) =@_;
664 $text =~ s/%25/%/g if &is_url_encoded($text);
665 # DM safing would have replaced underscores with character entity &#095;
666 # in SourceFile meta. Undo any such change to get the filename referred to.
667 $text =~ s/&\#095;/_/g;
668 return $text;
669}
670
671# When a filename on the filesystem is already URL-encoded, the
672# URL to it will have %25s in place of every % sign, so that
673# URLs in html pages can refer to the URL-encoded filename.
674# Given a (URL-encoded) filename on the filesystem, this subroutine
675# returns the URL reference string for it by replacing % with %25.
676# The output string will be the same as the input string if the input
677# already contains one or more %25s. This is to prevent processing
678# a url more than once this way.
679sub filename_to_url {
680 my ($text) = @_;
681
682 if($text !~ m/%25/) {
683 $text =~ s/%/%25/g;
684 }
685 return $text;
686}
687
688sub base64_encode {
689 my ($text) = @_;
690 if(!&conforms_to_mod_base64($text)) {
691 # return entity for underscore to underscore before encoding
692 $text =~ s/&\#095;/_/g;
693
694 $text = &MIME::Base64::encode_base64($text);
695 # base64 encoding may introduce + and / signs,
696 # replacing them with - and _ to ensure it's filename-safe
697 $text =~ s/\+/\-/g; # + -> -
698 $text =~ s/\//\_/g; # / -> _
699 }
700 return $text;
701}
702
703# If the input fits the modified base64 pattern, this will try decoding it.
704# Still, this method does not guarantee the return value is the 'original', only
705# that the result is where the base64 decoding process has been applied once.
706# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
707sub base64_decode {
708 my ($text) = @_;
709 if(&conforms_to_mod_base64($text)) {
710 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
711 # have replaced them with - and _ respectively. Undo this before decoding.
712 $text =~ s/\-/\+/g; # - -> +
713 $text =~ s/\_/\//g; # _ -> /
714 $text = &MIME::Base64::decode_base64($text);
715 }
716 return $text;
717}
718
719# Returns true if the given string is compatible with a modified version
720# of base64 (where the + and / are replaced with - and _), a format which
721# includes also regular ASCII alphanumeric values. This method does not
722# guarantee that the given string is actually base64 encoded, since it will
723# return true for any simple alphanumeric ASCII string as well.
724sub conforms_to_mod_base64 {
725 my ($text) = @_;
726
727 # need to treat the entity ref for underscore as underscore
728 $text =~ s/&\#095;/_/g;
729
730 # base 64 takes alphanumeric and [=+/],
731 # but we use modified base64 where + and / are replaced with - and _
732 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
733}
734
735sub substr
736{
737 my ($utf8_string, $offset, $length) = @_;
738
739 my @unicode_string = @{&utf82unicode($utf8_string)};
740 my $unicode_string_length = scalar(@unicode_string);
741
742 my $substr_start = $offset;
743 if ($substr_start >= $unicode_string_length) {
744 return "";
745 }
746
747 my $substr_end = $offset + $length - 1;
748 if ($substr_end >= $unicode_string_length) {
749 $substr_end = $unicode_string_length - 1;
750 }
751
752 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
753 return &unicode2utf8(\@unicode_substring);
754}
755
756# Useful method to print UTF8 (or other unicode) for debugging.
757# Characters that are easily displayed (that is, printable ASCII)
758# are shown as-is, whereas hex values of the unicode code points
759# are shown for all other chars.
760sub debug_unicode_string
761{
762 join("",
763 map { $_ > 127 ? # if wide character...
764 sprintf("\\x{%04X}", $_) : # \x{...}
765 chr($_)
766 } unpack("U*", $_[0])); # unpack Unicode characters
767}
768
769
770sub raw_filename_to_url_encoded
771{
772 my ($str_in) = @_;
773
774 my @url_encoded_chars
775 = map { $_ > 255 ? # Needs to be represent in entity form
776 sprintf("&#x%X;",$_) :
777 $_>127 || $_==ord("%") ? # Representable in %XX form
778 sprintf("%%%2X", $_) :
779 chr($_) # otherwise, Ascii char
780 } unpack("U*", $str_in); # Unpack Unicode characters
781
782
783 my $str_out = join("", @url_encoded_chars);
784
785 return $str_out;
786
787}
788
789sub url_encoded_to_raw_filename
790{
791 my ($str_in) = @_;
792
793 my $str_out = $str_in;
794
795 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
796 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
797 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
798
799 return $str_out;
800}
801
802
803sub raw_filename_to_utf8_url_encoded
804{
805 my ($str_in) = @_;
806
807 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
808
809 my @url_encoded_chars
810 = map { $_ > 127 ? # Representable in %XX form
811 sprintf("%%%2X", $_) :
812 chr($_) # otherwise, Ascii char
813 } unpack("U*", $str_in); # Unpack utf8 characters
814
815
816 my $str_out = join("", @url_encoded_chars);
817
818 return $str_out;
819
820}
821
822sub utf8_url_encoded_to_raw_filename
823{
824 my ($str_in) = @_;
825
826 my $utf8_str_out = $str_in;
827
828 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
829
830 my $unicode_str_out = decode("utf8",$utf8_str_out);
831 my $raw_str_out = utf8::downgrade($unicode_str_out);
832
833 return $raw_str_out;
834}
835
836sub analyze_raw_string
837{
838 my ($str_in) = @_;
839
840 my $uses_bytecodes = 0;
841 my $exceeds_bytecodes = 0;
842
843 map { $exceeds_bytecodes = 1 if ($_ >= 256);
844 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
845 } unpack("U*", $str_in); # Unpack Unicode characters
846
847 return ($uses_bytecodes,$exceeds_bytecodes);
848}
849
850
851sub convert_utf8_string_to_unicode_string
852{
853 my $utf8_string = shift(@_);
854
855 my $unicode_string = "";
856 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
857 $unicode_string .= chr($unicode_value);
858 }
859 return $unicode_string;
860}
861
862sub convert_unicode_string_to_utf8_string
863{
864 my $unicode_string = shift(@_);
865
866 my @unicode_array;
867 for (my $i = 0; $i < length($unicode_string); $i++) {
868 push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
869 }
870 return &unicode::unicode2utf8(\@unicode_array);
871}
872
873
8741;
Note: See TracBrowser for help on using the repository browser.