source: main/trunk/greenstone2/perllib/unicode.pm@ 34393

Last change on this file since 34393 was 34393, checked in by ak19, 4 years ago

Commit of independent code update before committing a bugfix. In this commit, the results of base64 encoding can have linebreaks, which aren't what we want or expect, so have a flag for removing them.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.8 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{
47 my ($str) = @_;
48
49 return if (!defined $str);
50 return "" if ($str eq "");
51
52 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54 my @each_char
55 = map { ($_ > 255 )
56 ? # if wide character...
57 sprintf("\\x{%04X}", $_)
58 : # \x{...}
59 (chr($_) =~ m/[[:cntrl:]]/ )
60 ? # else if control character ...
61 sprintf("\\x%02X", $_)
62 : # \x..
63 quotemeta(chr($_)) # else quoted or as themselves
64 } @unpacked_chars;
65
66 return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71 if (my $char = shift) {
72 return sprintf '%2.2x', unpack('U0U*', $char);
73 }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82 my ($in) = @_;
83 my $out = [];
84
85 my $i = 0;
86 my $len = length($in);
87 while ($i < $len) {
88 push (@$out, ord(substr ($in, $i, 1)));
89 $i++;
90 }
91
92 return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100 my ($in) = @_;
101 my $out = "";
102
103 if (!defined($in)|| !defined($$in)) {
104 return $out;
105 }
106
107 my ($c);
108 my $i = 0;
109 my $len = length($$in);
110 while ($i < $len) {
111 $c = ord (substr ($$in, $i, 1));
112 if ($c < 0x80) {
113 # ascii character
114 $out .= chr ($c);
115
116 } else {
117 # extended ascii character
118 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119 $out .= chr (0x80 + ($c & 0x3f));
120 }
121 $i++;
122 }
123
124 return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130 my ($in) = @_;
131 my $out = "";
132
133 foreach my $num (@$in) {
134 next unless defined $num;
135 if ($num < 0x80) {
136 $out .= chr ($num);
137
138 } elsif ($num < 0x800) {
139 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140 $out .= chr (0x80 + ($num & 0x3f));
141
142 } elsif ($num < 0xFFFF) {
143 $out .= chr (0xe0 + (($num >> 12) & 0xf));
144 $out .= chr (0x80 + (($num >> 6) & 0x3f));
145 $out .= chr (0x80 + ($num & 0x3f));
146
147 } else {
148 # error, don't encode anything
149 #die;
150 # Diego's bugfix: instead of aborting the import process, it
151 # is better to get a converted file with a few extra spaces
152 print STDERR "strange char: $num\n";
153 $out .= " ";
154
155 }
156 }
157 return $out;
158}
159
160# utf82unicode takes a utf-8 string and produces a unicode
161# array
162sub utf82unicode {
163 my ($in) = @_;
164 my $out = [];
165
166 if(!defined $in) {
167 return $out;
168 }
169
170 my $i = 0;
171 my ($c1, $c2, $c3);
172 my $len = length($in);
173 while ($i < $len) {
174 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175 # normal ascii character
176 push (@$out, $c1);
177
178 } elsif ($c1 < 0xc0) {
179 # error, was expecting the first byte of an
180 # encoded character. Do nothing.
181
182 } elsif ($c1 < 0xe0 && $i+1 < $len) {
183 # an encoded character with two bytes
184 $c2 = ord (substr ($in, $i+1, 1));
185 if ($c2 >= 0x80 && $c2 < 0xc0) {
186 # everything looks ok
187 push (@$out, ((($c1 & 0x1f) << 6) +
188 ($c2 & 0x3f)));
189 $i++; # gobbled an extra byte
190 }
191
192 } elsif ($c1 < 0xf0 && $i+2 < $len) {
193 # an encoded character with three bytes
194 $c2 = ord (substr ($in, $i+1, 1));
195 $c3 = ord (substr ($in, $i+2, 1));
196 if ($c2 >= 0x80 && $c2 < 0xc0 &&
197 $c3 >= 0x80 && $c3 < 0xc0) {
198 # everything looks ok
199 push (@$out, ((($c1 & 0xf) << 12) +
200 (($c2 & 0x3f) << 6) +
201 ($c3 & 0x3f)));
202
203 $i += 2; # gobbled an extra two bytes
204 }
205
206 } else {
207 # error, only decode Unicode characters not full UCS.
208 # Do nothing.
209 }
210
211 $i++;
212 }
213
214 return $out;
215}
216
217# unicode2ucs2 takes a unicode array and produces a UCS-2
218# unicode string (every two bytes forms a unicode character)
219sub unicode2ucs2 {
220 my ($in) = @_;
221 my $out = "";
222
223 foreach my $num (@$in) {
224 $out .= chr (($num & 0xff00) >> 8);
225 $out .= chr ($num & 0xff);
226 }
227
228 return $out;
229}
230
231# ucs22unicode takes a UCS-2 string and produces a unicode array
232sub ucs22unicode {
233 my ($in) = @_;
234 my $out = [];
235
236 my $i = 0;
237 my $len = length ($in);
238 while ($i+1 < $len) {
239 push (@$out, ord (substr($in, $i, 1)) << 8 +
240 ord (substr($in, $i+1, 1)));
241
242 $i ++;
243 }
244
245 return $out;
246}
247
248# takes a reference to a string and returns a reference to a unicode array
249sub convert2unicode {
250 my ($encoding, $textref) = @_;
251
252 if (!defined $encodings::encodings->{$encoding}) {
253 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254 return [];
255 }
256
257 my $encodename = "$encoding-unicode";
258 my $enc_info = $encodings::encodings->{$encoding};
259 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260 "to_uc", $enc_info->{'mapfile'});
261 if (!&loadmapencoding ($encodename, $mapfile)) {
262 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263 return [];
264 }
265
266 if (defined $enc_info->{'converter'}) {
267 my $converter = $enc_info->{'converter'};
268 return &$converter ($encodename, $textref);
269 }
270
271 if ($unicode::translations{$encodename}->{'count'} == 1) {
272 return &singlebyte2unicode ($encodename, $textref);
273 } else {
274 return &doublebyte2unicode ($encodename, $textref);
275 }
276}
277
278# singlebyte2unicode converts simple 8 bit encodings where characters below
279# 0x80 are normal ascii characters and the rest are decoded using the
280# appropriate mapping files.
281#
282# Examples of encodings that may be converted using singlebyte2unicode are
283# the iso-8859 and windows-125* series.
284sub singlebyte2unicode {
285 my ($encodename, $textref) = @_;
286
287 my @outtext = ();
288 my $len = length($$textref);
289 my ($c);
290 my $i = 0;
291
292 while ($i < $len) {
293 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294 # normal ascii character
295 push (@outtext, $c);
296 } else {
297 $c = &transchar ($encodename, $c);
298 # put a black square if cannot translate
299 $c = 0x25A1 if $c == 0;
300 push (@outtext, $c);
301 }
302 $i ++;
303 }
304 return \@outtext;
305}
306
307# doublebyte2unicode converts simple two byte encodings where characters
308# below code point 0x80 are single-byte characters and the rest are
309# double-byte characters.
310#
311# Examples of encodings that may be converted using doublebyte2unicode are
312# CJK encodings like GB encoded Chinese and UHC Korean.
313#
314# Note that no error checking is performed to make sure that the input text
315# is valid for the given encoding.
316#
317# Also, encodings that may contain characters of more than two bytes are
318# not supported (any EUC encoded text may in theory contain 3-byte
319# characters but in practice only one and two byte characters are used).
320sub doublebyte2unicode {
321 my ($encodename, $textref) = @_;
322
323 my @outtext = ();
324 my $len = length($$textref);
325 my ($c1, $c2);
326 my $i = 0;
327
328 while ($i < $len) {
329 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330 if ($i+1 < $len) {
331 # double-byte character
332 $c2 = ord(substr($$textref, $i+1, 1));
333 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
334 # put a black square if cannot translate
335 $c = 0x25A1 if $c == 0;
336 push (@outtext, $c);
337 $i += 2;
338
339 } else {
340 # error
341 print STDERR "unicode: ERROR missing second half of double-byte character\n";
342 $i++;
343 }
344
345 } else {
346 # single-byte character
347 push (@outtext, $c1);
348 $i++;
349 }
350 }
351 return \@outtext;
352}
353
354# Shift-JIS to unicode
355# We can't use doublebyte2unicode for Shift-JIS because it uses some
356# single-byte characters above code point 0x80 (i.e. half-width katakana
357# characters in the range 0xA1-0xDF)
358sub shiftjis2unicode {
359 my ($encodename, $textref) = @_;
360
361 my @outtext = ();
362 my $len = length($$textref);
363 my ($c1, $c2);
364 my $i = 0;
365
366 while ($i < $len) {
367 $c1 = ord(substr($$textref, $i, 1));
368
369 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
370 # Single-byte half-width katakana character or
371 # JIS Roman yen or overline characters
372 my $c = &transchar ($encodename, $c1);
373 # - put a black square if cannot translate
374 $c = 0x25A1 if $c == 0;
375 push (@outtext, $c);
376 $i++;
377
378 } elsif ($c1 < 0x80) {
379 # ASCII
380 push (@outtext, $c1);
381 $i ++;
382
383 } elsif ($c1 < 0xEF) {
384 if ($i+1 < $len) {
385 $c2 = ord(substr($$textref, $i+1, 1));
386 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
387 # Double-byte shift-jis character
388 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
389 # put a black square if cannot translate
390 $c = 0x25A1 if $c == 0;
391 push (@outtext, $c);
392 } else {
393 # error
394 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395 }
396 $i += 2;
397 } else {
398 # error
399 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400 $i ++;
401 }
402 } else {
403 # error
404 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405 $i ++;
406 }
407 }
408 return \@outtext;
409}
410
411sub transchar {
412 my ($encoding, $from) = @_;
413 my $high = ($from / 256) % 256;
414 my $low = $from % 256;
415
416 return 0 unless defined $unicode::translations{$encoding};
417
418 my $block = $unicode::translations{$encoding}->{'map'};
419
420 if (ref ($block->[$high]) ne "ARRAY") {
421 return 0;
422 }
423 return $block->[$high]->[$low];
424}
425
426# %translations is of the form:
427#
428# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430#
431# Any of the top translation blocks can point to an undefined
432# value. This data structure aims to allow fast translation and
433# efficient storage.
434%unicode::translations = ();
435
436# @array256 is used for initialisation, there must be
437# a better way...
438# What about this?: @array256 = (0) x 256;
439@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456# returns 1 if successful, 0 if unsuccessful
457sub loadmapencoding {
458 my ($encoding, $mapfile) = @_;
459
460 # check to see if the encoding has already been loaded
461 return 1 if (defined $unicode::translations{$encoding});
462
463 if (! -r $mapfile || -d $mapfile) {
464 return 0;
465 }
466 return 0 unless open (MAPFILE, $mapfile);
467 binmode (MAPFILE);
468
469 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470 my $block = $unicode::translations{$encoding};
471
472 my ($in,$i,$j);
473 while (1) {
474 my $ret=read(MAPFILE, $in, 1);
475 if (!defined($ret)) { # error
476 print STDERR "unicode.pm: error reading mapfile: $!\n";
477 last;
478 }
479 if ($ret != 1) { last }
480 $i = unpack ("C", $in);
481 $block->{'map'}->[$i] = [@unicode::array256];
482 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483 my ($n1, $n2) = unpack ("CC", $in);
484 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485 }
486 $block->{'count'} ++;
487 }
488
489 close (MAPFILE);
490}
491
492# unicode2singlebyte converts unicode to simple 8 bit encodings where
493# characters below 0x80 are normal ascii characters and the rest are encoded
494# using the appropriate mapping files.
495#
496# Examples of encodings that may be converted using unicode2singlebyte are
497# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498sub unicode2singlebyte {
499 my ($uniref, $encoding) = @_;
500
501 my $outtext = "";
502 my $encodename = "unicode-$encoding";
503
504 if (!exists $encodings::encodings->{$encoding}) {
505 print STDERR "unicode.pm: ERROR - unsupported encoding "
506 . "'$encoding' requested\n";
507 return "";
508 }
509
510 my $enc_info = $encodings::encodings->{$encoding};
511 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512 "from_uc", $enc_info->{'mapfile'});
513 if (!&loadmapencoding ($encodename, $mapfile)) {
514 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515 return "";
516 }
517
518 foreach my $c (@$uniref) {
519 if ($c < 0x80) {
520 # normal ascii character
521 $outtext .= chr($c);
522 } else {
523 # extended ascii character
524 $c = &transchar ($encodename, $c);
525
526 # put a question mark if cannot translate
527 if ($c == 0) {
528 $outtext .= "?";
529 } else {
530 $outtext .= chr($c);
531 }
532 }
533 }
534 return $outtext;
535}
536
537
538# this makes sure that the referenced input string is utf8 encoded, and
539# will change/remove bytes that aren't.
540# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541sub ensure_utf8 {
542 my $stringref=shift;
543
544 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
545 return $stringref;
546 }
547
548 my $value=$$stringref;
549
550 my $non_utf8_found = 0;
551 $value =~ m/^/g; # to set \G
552 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553 my $highbytes=$1;
554 my $highbyteslength=length($highbytes);
555 # make sure this block of high bytes is utf-8
556 $highbytes =~ /^/g; # set pos()
557 my $byte_replaced = 0;
558 while ($highbytes =~
559 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
560 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
561 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
562 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
563 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
564 )*([\x80-\xff])? !xg
565 ) {
566 # this highbyte is "out-of-place" for valid utf-8
567 my $badbyte=$1;
568 if (!defined $badbyte) {next} # hit end of string
569 my $pos=pos($highbytes);
570 # replace bad byte. assume iso-8859-1 -> utf-8
571 # ascii2utf8 does "extended ascii"... ie iso-8859-1
572 my $replacement=&unicode::ascii2utf8(\$badbyte);
573 substr($highbytes, $pos-1, 1, $replacement);
574 # update the position to continue searching (for \G)
575 pos($highbytes) = $pos+length($replacement)-1;
576 $byte_replaced = 1;
577 }
578 if ($byte_replaced) {
579 # replace this block of high bytes in the $value
580 $non_utf8_found = 1;
581 my $replength=length($highbytes); # we've changed the length
582 my $textpos=pos($value); # pos at end of last match
583 # replace bad bytes with good bytes
584 substr($value, $textpos-$highbyteslength,
585 $highbyteslength, $highbytes);
586 # update the position to continue searching (for \G)
587 pos($value)=$textpos+($replength-$highbyteslength)+1;
588 }
589 }
590
591 $$stringref = $value;
592 return $non_utf8_found;
593}
594
595# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596# Does not modify the string parameter.
597sub check_is_utf8 {
598 my $value=shift;
599
600 if (!defined($value)) {
601 return 0; # not utf8 because it is undefined
602 }
603
604 $value =~ m/^/g; # to set \G
605 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606 my $highbytes=$1;
607 # make sure this block of high bytes is utf-8
608 $highbytes =~ /^/g; # set pos()
609 while ($highbytes =~
610 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
611 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
612 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
613 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
614 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
615 )*([\x80-\xff])? !xg
616 ) {
617 my $badbyte=$1;
618 if (defined $badbyte) { # not end of string
619 return 0; # non-utf8 found
620 }
621 }
622 }
623
624 return 1;
625}
626
627sub url_encode {
628 my ($text) = @_;
629
630 if (!&is_url_encoded($text)) {
631 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632 # return the url-encoded character entity for underscore back to the entity
633 $text =~ s/%26%23095%3B/&\#095;/g;
634 }
635 return $text;
636}
637
638sub url_decode {
639 my ($text,$and_numeric_entities) = @_;
640
641 if(defined $text) {
642 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643
644 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647 }
648 }
649
650 return $text;
651}
652
653sub url_decode_utf8 {
654 my ($text,$and_numeric_entities) = @_;
655
656 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658 $text = Encode::decode("utf8",$text);
659
660 return $text;
661}
662
663sub is_url_encoded {
664 my ($text) = @_;
665 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
666}
667
668# When a filename on the filesystem is already URL-encoded, the
669# URL to it will have %25s in place of every % sign, so that
670# URLs in html pages can refer to the URL-encoded filename.
671# This method changes the URL reference back into the actual
672# (URL-encoded) filename on the filesystem by replacing %25 with %.
673sub url_to_filename {
674 my ($text) =@_;
675 $text =~ s/%25/%/g if &is_url_encoded($text);
676 # DM safing would have replaced underscores with character entity &#095;
677 # in SourceFile meta. Undo any such change to get the filename referred to.
678 $text =~ s/&\#095;/_/g;
679 return $text;
680}
681
682# When a filename on the filesystem is already URL-encoded, the
683# URL to it will have %25s in place of every % sign, so that
684# URLs in html pages can refer to the URL-encoded filename.
685# Given a (URL-encoded) filename on the filesystem, this subroutine
686# returns the URL reference string for it by replacing % with %25.
687# The output string will be the same as the input string if the input
688# already contains one or more %25s. This is to prevent processing
689# a url more than once this way.
690sub filename_to_url {
691 my ($text) = @_;
692
693 if($text !~ m/%25/) {
694 $text =~ s/%/%25/g;
695 }
696 return $text;
697}
698
699# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
700# If the $force_encode parameter is true, then this method WILL base64 encode whatever
701# string is passed in, including any plain ASCII string.
702# That means this method could double encode an already encoded string.
703# However, this method is necessary because on the GLI end, we can't detect whether a plain
704# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
705# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
706# (except the "import" prefix, which is special) or none of the parts.
707sub force_base64_encode {
708 my ($text) = @_;
709 my $force_encode = 1;
710 my $no_newlines = 1;
711 return &base64_encode($text, $force_encode, $no_newlines);
712}
713
714sub base64_encode {
715 my ($text, $force_encode, $no_newlines) = @_;
716 if($force_encode || !&conforms_to_mod_base64($text)) {
717 # return entity for underscore to underscore before encoding
718 $text =~ s/&\#095;/_/g;
719
720 $text = &MIME::Base64::encode_base64($text);
721 # base64 encoding may introduce + and / signs,
722 # replacing them with - and _ to ensure it's filename-safe
723 $text =~ s/\+/\-/g; # + -> -
724 $text =~ s/\//\_/g; # / -> _
725
726 # by default base64 encoding a long string introduces newlines to break long strings,
727 # see https://stackoverflow.com/questions/19952621/is-it-ok-to-remove-newline-in-base64-encoding
728 # Java doesn't like that
729 $text =~ s/\n//gs if $no_newlines;
730 }
731 return $text;
732}
733
734# If the input fits the modified base64 pattern, this will try decoding it.
735# Still, this method does not guarantee the return value is the 'original', only
736# that the result is where the base64 decoding process has been applied once.
737# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
738sub base64_decode {
739 my ($text) = @_;
740 if(&conforms_to_mod_base64($text)) {
741 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
742 # have replaced them with - and _ respectively. Undo this before decoding.
743 $text =~ s/\-/\+/g; # - -> +
744 $text =~ s/\_/\//g; # _ -> /
745 $text = &MIME::Base64::decode_base64($text);
746 }
747 return $text;
748}
749
750# Returns true if the given string is compatible with a modified version
751# of base64 (where the + and / are replaced with - and _), a format which
752# includes also regular ASCII alphanumeric values. This method does not
753# guarantee that the given string is actually base64 encoded, since it will
754# return true for any simple alphanumeric ASCII string as well.
755sub conforms_to_mod_base64 {
756 my ($text) = @_;
757
758 # need to treat the entity ref for underscore as underscore
759 $text =~ s/&\#095;/_/g;
760
761 # base 64 takes alphanumeric and [=+/],
762 # but we use modified base64 where + and / are replaced with - and _
763 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
764}
765
766sub substr
767{
768 my ($utf8_string, $offset, $length) = @_;
769
770 my @unicode_string = @{&utf82unicode($utf8_string)};
771 my $unicode_string_length = scalar(@unicode_string);
772
773 my $substr_start = $offset;
774 if ($substr_start >= $unicode_string_length) {
775 return "";
776 }
777
778 my $substr_end = $offset + $length - 1;
779 if ($substr_end >= $unicode_string_length) {
780 $substr_end = $unicode_string_length - 1;
781 }
782
783 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
784 return &unicode2utf8(\@unicode_substring);
785}
786
787# perl version of stringToHex
788# Useful method to print UTF8 (or other unicode) for debugging.
789# Characters that are easily displayed (that is, printable ASCII)
790# are shown as-is, whereas hex values of the unicode code points
791# are shown for all other chars.
792sub debug_unicode_string
793{
794 join("",
795 map { $_ > 127 ? # if wide character...
796 sprintf("\\x{%04X}", $_) : # \x{...}
797 chr($_)
798 } unpack("U*", $_[0])); # unpack Unicode characters
799}
800
801sub raw_filename_to_url_encoded
802{
803 my ($str_in) = @_;
804
805 my @url_encoded_chars
806 = map { $_ > 255 ? # Needs to be represent in entity form
807 sprintf("&#x%X;",$_) :
808 $_>127 || $_==ord("%") ? # Representable in %XX form
809 sprintf("%%%2X", $_) :
810 chr($_) # otherwise, Ascii char
811 } unpack("U*", $str_in); # Unpack Unicode characters
812
813
814 my $str_out = join("", @url_encoded_chars);
815
816 return $str_out;
817
818}
819
820sub url_encoded_to_raw_filename
821{
822 my ($str_in) = @_;
823
824 my $str_out = $str_in;
825
826 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
827 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
828 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
829
830 return $str_out;
831}
832
833
834sub raw_filename_to_utf8_url_encoded
835{
836 my ($str_in) = @_;
837
838 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
839
840 my @url_encoded_chars
841 = map { $_ > 127 ? # Representable in %XX form
842 sprintf("%%%2X", $_) :
843 chr($_) # otherwise, Ascii char
844 } unpack("U*", $str_in); # Unpack utf8 characters
845
846
847 my $str_out = join("", @url_encoded_chars);
848
849 return $str_out;
850
851}
852
853sub utf8_url_encoded_to_raw_filename
854{
855 my ($str_in) = @_;
856
857 my $utf8_str_out = $str_in;
858
859 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
860
861 my $unicode_str_out = decode("utf8",$utf8_str_out);
862 my $raw_str_out = utf8::downgrade($unicode_str_out);
863
864 return $raw_str_out;
865}
866
867sub analyze_raw_string
868{
869 my ($str_in) = @_;
870
871 my $uses_bytecodes = 0;
872 my $exceeds_bytecodes = 0;
873
874 map { $exceeds_bytecodes = 1 if ($_ >= 256);
875 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
876 } unpack("U*", $str_in); # Unpack Unicode characters
877
878 return ($uses_bytecodes,$exceeds_bytecodes);
879}
880
881
882sub convert_utf8_string_to_unicode_string
883{
884 my $utf8_string = shift(@_);
885
886 my $unicode_string = "";
887 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
888 $unicode_string .= chr($unicode_value);
889 }
890 return $unicode_string;
891}
892
893sub convert_unicode_string_to_utf8_string
894{
895 my $unicode_string = shift(@_);
896
897 my @unicode_array;
898 for (my $i = 0; $i < length($unicode_string); $i++) {
899 push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
900 }
901 return &unicode::unicode2utf8(\@unicode_array);
902}
903
904
9051;
Note: See TracBrowser for help on using the repository browser.