source: main/trunk/greenstone2/perllib/unicode.pm

Last change on this file was 35165, checked in by kjdon, 3 years ago

updated to and from utf8 methods to handle 4 byte utf8 characters - emojis are in this space.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.2 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{
47 my ($str) = @_;
48
49 return if (!defined $str);
50 return "" if ($str eq "");
51
52 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54 my @each_char
55 = map { ($_ > 255 )
56 ? # if wide character...
57 sprintf("\\x{%04X}", $_)
58 : # \x{...}
59 (chr($_) =~ m/[[:cntrl:]]/ )
60 ? # else if control character ...
61 sprintf("\\x%02X", $_)
62 : # \x..
63 quotemeta(chr($_)) # else quoted or as themselves
64 } @unpacked_chars;
65
66 return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71 if (my $char = shift) {
72 return sprintf '%2.2x', unpack('U0U*', $char);
73 }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82 my ($in) = @_;
83 my $out = [];
84
85 my $i = 0;
86 my $len = length($in);
87 while ($i < $len) {
88 push (@$out, ord(substr ($in, $i, 1)));
89 $i++;
90 }
91
92 return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100 my ($in) = @_;
101 my $out = "";
102
103 if (!defined($in)|| !defined($$in)) {
104 return $out;
105 }
106
107 my ($c);
108 my $i = 0;
109 my $len = length($$in);
110 while ($i < $len) {
111 $c = ord (substr ($$in, $i, 1));
112 if ($c < 0x80) {
113 # ascii character
114 $out .= chr ($c);
115
116 } else {
117 # extended ascii character
118 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119 $out .= chr (0x80 + ($c & 0x3f));
120 }
121 $i++;
122 }
123
124 return $out;
125}
126
127# utf-8 SCHEME:
128#Char. number range | UTF-8 bytes/octets sequence
129# (hexadecimal) | (binary)
130#--------------------+------------------------------------
131#0000 0000 - 0000 007F | 0xxxxxxx
132#0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
133#0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
134#0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
135
136
137# 4 byte code from a web page https://www.perl.com/article/building-a-utf-8-encoder-in-perl/
138# unicode2utf8 takes a unicode array as input and encodes it
139# using utf-8
140sub unicode2utf8 {
141 my ($in) = @_;
142 my $out = "";
143
144 foreach my $num (@$in) {
145 next unless defined $num;
146 if ($num < 0x80) {# 10 000 000
147 # no transformation needed
148 $out .= chr ($num);
149
150 } elsif ($num < 0x800) {# 100 000 000 000 (0x80 to 0x7ff)
151 # populate bitmask 110xxxxx 10xxxxxx with the code point
152 $out .= chr (0xc0 + (($num >> 6) & 0x1f)); # top 5 bits go in first byte ($num >> 6 | 0b11000000)
153 $out .= chr (0x80 + ($num & 0x3f)); # bottom 6 bit go in second byte ($num & 0b00111111 | 0b10000000)
154
155 } elsif ($num < 0x10000) {# 10 000 000 000 000 000 (0x800 to 0xffff)
156 # populate bitmask 1110xxxx 10xxxxxx 10xxxxxx
157 $out .= chr (0xe0 + (($num >> 12) & 0xf)); # top 4 bits ($num >> 12 | 0b11100000)
158 $out .= chr (0x80 + (($num >> 6) & 0x3f)); # next 6 bits ($num >> 6 & 0b00111111 | 0b10000000)
159 $out .= chr (0x80 + ($num & 0x3f)); # last 6 bits ($num & 0b00111111 | 0b10000000)
160
161 } else { # (0x10000 to 0x10ffff)
162 #populate bitmask 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
163 $out .= chr(($num >> 18 )|0b11110000);
164 $out .= chr(($num >> 12) & 0b00111111 | 0b10000000);
165 $out .= chr(($num >> 6 ) & 0b00111111 | 0b10000000);
166 $out .= chr ($num & 0b00111111 | 0b10000000);
167
168 }
169 }
170 return $out;
171}
172
173
174# utf82unicode takes a utf-8 string and produces a unicode
175# array
176sub utf82unicode {
177 my ($in) = @_;
178 my $out = [];
179
180 if(!defined $in) {
181 return $out;
182 }
183
184 my $i = 0;
185 my ($c1, $c2, $c3, $c4);
186 my $len = length($in);
187 while ($i < $len) {
188 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
189 # normal ascii character
190 push (@$out, $c1);
191
192 } elsif ($c1 < 0xc0) {
193 # error, was expecting the first byte of an
194 # encoded character. Do nothing.
195
196 } elsif ($c1 < 0xe0 && $i+1 < $len) {
197 # an encoded character with two bytes
198 $c2 = ord (substr ($in, $i+1, 1));
199 if ($c2 >= 0x80 && $c2 < 0xc0) {
200 # everything looks ok
201 push (@$out, ((($c1 & 0x1f) << 6) +
202 ($c2 & 0x3f)));
203 $i++; # gobbled an extra byte
204 }
205
206 } elsif ($c1 < 0xf0 && $i+2 < $len) {
207 # an encoded character with three bytes
208 $c2 = ord (substr ($in, $i+1, 1));
209 $c3 = ord (substr ($in, $i+2, 1));
210 if ($c2 >= 0x80 && $c2 < 0xc0 &&
211 $c3 >= 0x80 && $c3 < 0xc0) {
212 # everything looks ok
213 push (@$out, ((($c1 & 0xf) << 12) +
214 (($c2 & 0x3f) << 6) +
215 ($c3 & 0x3f)));
216
217 $i += 2; # gobbled an extra two bytes
218 }
219
220 } elsif ($c1 >= 0xf0 && $i+3 < $len) {
221 # an encoded character with four bytes
222 $c2 = ord (substr ($in, $i+1, 1));
223 $c3 = ord (substr ($in, $i+2, 1));
224 $c4 = ord (substr ($in, $i+2, 1));
225 if ($c2 >= 0x80 && $c2 < 0xc0 &&
226 $c3 >= 0x80 && $c3 < 0xc0 &&
227 $c4 >= 0x80 && $c4 < 0xc0) {
228 # everything looks ok
229 push (@$out, ((($c1 & 0x7) << 18) +
230 (($c2 & 0x3f) << 12) +
231 (($c3 & 0x3f) << 6) +
232 ($c4 & 0x3f)));
233 $i+= 3; # gobbled an extra 3 bytes
234 }
235 } else {
236
237 # error
238 # Do nothing.
239 }
240
241 $i++;
242 }
243
244 return $out;
245}
246
247# unicode2ucs2 takes a unicode array and produces a UCS-2
248# unicode string (every two bytes forms a unicode character)
249sub unicode2ucs2 {
250 my ($in) = @_;
251 my $out = "";
252
253 foreach my $num (@$in) {
254 $out .= chr (($num & 0xff00) >> 8);
255 $out .= chr ($num & 0xff);
256 }
257
258 return $out;
259}
260
261# ucs22unicode takes a UCS-2 string and produces a unicode array
262sub ucs22unicode {
263 my ($in) = @_;
264 my $out = [];
265
266 my $i = 0;
267 my $len = length ($in);
268 while ($i+1 < $len) {
269 push (@$out, ord (substr($in, $i, 1)) << 8 +
270 ord (substr($in, $i+1, 1)));
271
272 $i ++;
273 }
274
275 return $out;
276}
277
278# takes a reference to a string and returns a reference to a unicode array
279sub convert2unicode {
280 my ($encoding, $textref) = @_;
281
282 if (!defined $encodings::encodings->{$encoding}) {
283 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
284 return [];
285 }
286
287 my $encodename = "$encoding-unicode";
288 my $enc_info = $encodings::encodings->{$encoding};
289 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
290 "to_uc", $enc_info->{'mapfile'});
291 if (!&loadmapencoding ($encodename, $mapfile)) {
292 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
293 return [];
294 }
295
296 if (defined $enc_info->{'converter'}) {
297 my $converter = $enc_info->{'converter'};
298 return &$converter ($encodename, $textref);
299 }
300
301 if ($unicode::translations{$encodename}->{'count'} == 1) {
302 return &singlebyte2unicode ($encodename, $textref);
303 } else {
304 return &doublebyte2unicode ($encodename, $textref);
305 }
306}
307
308# singlebyte2unicode converts simple 8 bit encodings where characters below
309# 0x80 are normal ascii characters and the rest are decoded using the
310# appropriate mapping files.
311#
312# Examples of encodings that may be converted using singlebyte2unicode are
313# the iso-8859 and windows-125* series.
314sub singlebyte2unicode {
315 my ($encodename, $textref) = @_;
316
317 my @outtext = ();
318 my $len = length($$textref);
319 my ($c);
320 my $i = 0;
321
322 while ($i < $len) {
323 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
324 # normal ascii character
325 push (@outtext, $c);
326 } else {
327 $c = &transchar ($encodename, $c);
328 # put a black square if cannot translate
329 $c = 0x25A1 if $c == 0;
330 push (@outtext, $c);
331 }
332 $i ++;
333 }
334 return \@outtext;
335}
336
337# doublebyte2unicode converts simple two byte encodings where characters
338# below code point 0x80 are single-byte characters and the rest are
339# double-byte characters.
340#
341# Examples of encodings that may be converted using doublebyte2unicode are
342# CJK encodings like GB encoded Chinese and UHC Korean.
343#
344# Note that no error checking is performed to make sure that the input text
345# is valid for the given encoding.
346#
347# Also, encodings that may contain characters of more than two bytes are
348# not supported (any EUC encoded text may in theory contain 3-byte
349# characters but in practice only one and two byte characters are used).
350sub doublebyte2unicode {
351 my ($encodename, $textref) = @_;
352
353 my @outtext = ();
354 my $len = length($$textref);
355 my ($c1, $c2);
356 my $i = 0;
357
358 while ($i < $len) {
359 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
360 if ($i+1 < $len) {
361 # double-byte character
362 $c2 = ord(substr($$textref, $i+1, 1));
363 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
364 # put a black square if cannot translate
365 $c = 0x25A1 if $c == 0;
366 push (@outtext, $c);
367 $i += 2;
368
369 } else {
370 # error
371 print STDERR "unicode: ERROR missing second half of double-byte character\n";
372 $i++;
373 }
374
375 } else {
376 # single-byte character
377 push (@outtext, $c1);
378 $i++;
379 }
380 }
381 return \@outtext;
382}
383
384# Shift-JIS to unicode
385# We can't use doublebyte2unicode for Shift-JIS because it uses some
386# single-byte characters above code point 0x80 (i.e. half-width katakana
387# characters in the range 0xA1-0xDF)
388sub shiftjis2unicode {
389 my ($encodename, $textref) = @_;
390
391 my @outtext = ();
392 my $len = length($$textref);
393 my ($c1, $c2);
394 my $i = 0;
395
396 while ($i < $len) {
397 $c1 = ord(substr($$textref, $i, 1));
398
399 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
400 # Single-byte half-width katakana character or
401 # JIS Roman yen or overline characters
402 my $c = &transchar ($encodename, $c1);
403 # - put a black square if cannot translate
404 $c = 0x25A1 if $c == 0;
405 push (@outtext, $c);
406 $i++;
407
408 } elsif ($c1 < 0x80) {
409 # ASCII
410 push (@outtext, $c1);
411 $i ++;
412
413 } elsif ($c1 < 0xEF) {
414 if ($i+1 < $len) {
415 $c2 = ord(substr($$textref, $i+1, 1));
416 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
417 # Double-byte shift-jis character
418 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
419 # put a black square if cannot translate
420 $c = 0x25A1 if $c == 0;
421 push (@outtext, $c);
422 } else {
423 # error
424 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
425 }
426 $i += 2;
427 } else {
428 # error
429 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
430 $i ++;
431 }
432 } else {
433 # error
434 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
435 $i ++;
436 }
437 }
438 return \@outtext;
439}
440
441sub transchar {
442 my ($encoding, $from) = @_;
443 my $high = ($from / 256) % 256;
444 my $low = $from % 256;
445
446 return 0 unless defined $unicode::translations{$encoding};
447
448 my $block = $unicode::translations{$encoding}->{'map'};
449
450 if (ref ($block->[$high]) ne "ARRAY") {
451 return 0;
452 }
453 return $block->[$high]->[$low];
454}
455
456# %translations is of the form:
457#
458# encodings{encodingname-encodingname}->{'map'}->blocktranslation
459# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
460#
461# Any of the top translation blocks can point to an undefined
462# value. This data structure aims to allow fast translation and
463# efficient storage.
464%unicode::translations = ();
465
466# @array256 is used for initialisation, there must be
467# a better way...
468# What about this?: @array256 = (0) x 256;
469@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
470 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
471 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
472 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
473 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
474 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
475 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
476 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
477 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
478 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
479 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
480 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
481 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
482 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
483 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
484 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
485
486# returns 1 if successful, 0 if unsuccessful
487sub loadmapencoding {
488 my ($encoding, $mapfile) = @_;
489
490 # check to see if the encoding has already been loaded
491 return 1 if (defined $unicode::translations{$encoding});
492
493 if (! -r $mapfile || -d $mapfile) {
494 return 0;
495 }
496 return 0 unless open (MAPFILE, $mapfile);
497 binmode (MAPFILE);
498
499 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
500 my $block = $unicode::translations{$encoding};
501
502 my ($in,$i,$j);
503 while (1) {
504 my $ret=read(MAPFILE, $in, 1);
505 if (!defined($ret)) { # error
506 print STDERR "unicode.pm: error reading mapfile: $!\n";
507 last;
508 }
509 if ($ret != 1) { last }
510 $i = unpack ("C", $in);
511 $block->{'map'}->[$i] = [@unicode::array256];
512 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
513 my ($n1, $n2) = unpack ("CC", $in);
514 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
515 }
516 $block->{'count'} ++;
517 }
518
519 close (MAPFILE);
520}
521
522# unicode2singlebyte converts unicode to simple 8 bit encodings where
523# characters below 0x80 are normal ascii characters and the rest are encoded
524# using the appropriate mapping files.
525#
526# Examples of encodings that may be converted using unicode2singlebyte are
527# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
528sub unicode2singlebyte {
529 my ($uniref, $encoding) = @_;
530
531 my $outtext = "";
532 my $encodename = "unicode-$encoding";
533
534 if (!exists $encodings::encodings->{$encoding}) {
535 print STDERR "unicode.pm: ERROR - unsupported encoding "
536 . "'$encoding' requested\n";
537 return "";
538 }
539
540 my $enc_info = $encodings::encodings->{$encoding};
541 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
542 "from_uc", $enc_info->{'mapfile'});
543 if (!&loadmapencoding ($encodename, $mapfile)) {
544 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
545 return "";
546 }
547
548 foreach my $c (@$uniref) {
549 if ($c < 0x80) {
550 # normal ascii character
551 $outtext .= chr($c);
552 } else {
553 # extended ascii character
554 $c = &transchar ($encodename, $c);
555
556 # put a question mark if cannot translate
557 if ($c == 0) {
558 $outtext .= "?";
559 } else {
560 $outtext .= chr($c);
561 }
562 }
563 }
564 return $outtext;
565}
566
567
568# this makes sure that the referenced input string is utf8 encoded, and
569# will change/remove bytes that aren't.
570# returns 0 if the text was already utf8, or 1 if text modified to become utf8
571sub ensure_utf8 {
572 my $stringref=shift;
573
574 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
575 return $stringref;
576 }
577
578 my $value=$$stringref;
579
580 my $non_utf8_found = 0;
581 $value =~ m/^/g; # to set \G
582 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
583 my $highbytes=$1;
584 my $highbyteslength=length($highbytes);
585 # make sure this block of high bytes is utf-8
586 $highbytes =~ /^/g; # set pos()
587 my $byte_replaced = 0;
588 while ($highbytes =~
589 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
590 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
591 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
592 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
593 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
594 )*([\x80-\xff])? !xg
595 ) {
596 # this highbyte is "out-of-place" for valid utf-8
597 my $badbyte=$1;
598 if (!defined $badbyte) {next} # hit end of string
599 my $pos=pos($highbytes);
600 # replace bad byte. assume iso-8859-1 -> utf-8
601 # ascii2utf8 does "extended ascii"... ie iso-8859-1
602 my $replacement=&unicode::ascii2utf8(\$badbyte);
603 substr($highbytes, $pos-1, 1, $replacement);
604 # update the position to continue searching (for \G)
605 pos($highbytes) = $pos+length($replacement)-1;
606 $byte_replaced = 1;
607 }
608 if ($byte_replaced) {
609 # replace this block of high bytes in the $value
610 $non_utf8_found = 1;
611 my $replength=length($highbytes); # we've changed the length
612 my $textpos=pos($value); # pos at end of last match
613 # replace bad bytes with good bytes
614 substr($value, $textpos-$highbyteslength,
615 $highbyteslength, $highbytes);
616 # update the position to continue searching (for \G)
617 pos($value)=$textpos+($replength-$highbyteslength)+1;
618 }
619 }
620
621 $$stringref = $value;
622 return $non_utf8_found;
623}
624
625# Returns true (1) if the given string is utf8 and false (0) if it isn't.
626# Does not modify the string parameter.
627sub check_is_utf8 {
628 my $value=shift;
629
630 if (!defined($value)) {
631 return 0; # not utf8 because it is undefined
632 }
633
634 $value =~ m/^/g; # to set \G
635 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
636 my $highbytes=$1;
637 # make sure this block of high bytes is utf-8
638 $highbytes =~ /^/g; # set pos()
639 while ($highbytes =~
640 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
641 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
642 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
643 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
644 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
645 )*([\x80-\xff])? !xg
646 ) {
647 my $badbyte=$1;
648 if (defined $badbyte) { # not end of string
649 return 0; # non-utf8 found
650 }
651 }
652 }
653
654 return 1;
655}
656
657sub url_encode {
658 my ($text) = @_;
659
660 if (!&is_url_encoded($text)) {
661 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
662 # return the url-encoded character entity for underscore back to the entity
663 $text =~ s/%26%23095%3B/&\#095;/g;
664 }
665 return $text;
666}
667
668sub url_decode {
669 my ($text,$and_numeric_entities) = @_;
670
671 if(defined $text) {
672 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
673
674 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
675 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
676 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
677 }
678 }
679
680 return $text;
681}
682
683sub url_decode_utf8 {
684 my ($text,$and_numeric_entities) = @_;
685
686 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
687
688 $text = Encode::decode("utf8",$text);
689
690 return $text;
691}
692
693sub is_url_encoded {
694 my ($text) = @_;
695 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
696}
697
698# When a filename on the filesystem is already URL-encoded, the
699# URL to it will have %25s in place of every % sign, so that
700# URLs in html pages can refer to the URL-encoded filename.
701# This method changes the URL reference back into the actual
702# (URL-encoded) filename on the filesystem by replacing %25 with %.
703sub url_to_filename {
704 my ($text) =@_;
705 $text =~ s/%25/%/g if &is_url_encoded($text);
706 # DM safing would have replaced underscores with character entity &#095;
707 # in SourceFile meta. Undo any such change to get the filename referred to.
708 $text =~ s/&\#095;/_/g;
709 return $text;
710}
711
712# When a filename on the filesystem is already URL-encoded, the
713# URL to it will have %25s in place of every % sign, so that
714# URLs in html pages can refer to the URL-encoded filename.
715# Given a (URL-encoded) filename on the filesystem, this subroutine
716# returns the URL reference string for it by replacing % with %25.
717# The output string will be the same as the input string if the input
718# already contains one or more %25s. This is to prevent processing
719# a url more than once this way.
720sub filename_to_url {
721 my ($text) = @_;
722
723 if($text !~ m/%25/) {
724 $text =~ s/%/%25/g;
725 }
726 return $text;
727}
728
729# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
730# If the $force_encode parameter is true, then this method WILL base64 encode whatever
731# string is passed in, including any plain ASCII string.
732# That means this method could double encode an already encoded string.
733# However, this method is necessary because on the GLI end, we can't detect whether a plain
734# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
735# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
736# (except the "import" prefix, which is special) or none of the parts.
737sub force_base64_encode {
738 my ($text) = @_;
739 my $force_encode = 1;
740 my $no_newlines = 1;
741 return &base64_encode($text, $force_encode, $no_newlines);
742}
743
744sub base64_encode {
745 my ($text, $force_encode, $no_newlines) = @_;
746 if($force_encode || !&conforms_to_mod_base64($text)) {
747 # return entity for underscore to underscore before encoding
748 $text =~ s/&\#095;/_/g;
749
750 $text = &MIME::Base64::encode_base64($text);
751 # base64 encoding may introduce + and / signs,
752 # replacing them with - and _ to ensure it's filename-safe
753 $text =~ s/\+/\-/g; # + -> -
754 $text =~ s/\//\_/g; # / -> _
755
756 # by default base64 encoding a long string introduces newlines to break long strings,
757 # see https://stackoverflow.com/questions/19952621/is-it-ok-to-remove-newline-in-base64-encoding
758 # Java doesn't like that
759 $text =~ s/\n//gs if $no_newlines;
760 }
761 return $text;
762}
763
764# If the input fits the modified base64 pattern, this will try decoding it.
765# Still, this method does not guarantee the return value is the 'original', only
766# that the result is where the base64 decoding process has been applied once.
767# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
768sub base64_decode {
769 my ($text) = @_;
770 if(&conforms_to_mod_base64($text)) {
771 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
772 # have replaced them with - and _ respectively. Undo this before decoding.
773 $text =~ s/\-/\+/g; # - -> +
774 $text =~ s/\_/\//g; # _ -> /
775 $text = &MIME::Base64::decode_base64($text);
776 }
777 return $text;
778}
779
780# Returns true if the given string is compatible with a modified version
781# of base64 (where the + and / are replaced with - and _), a format which
782# includes also regular ASCII alphanumeric values. This method does not
783# guarantee that the given string is actually base64 encoded, since it will
784# return true for any simple alphanumeric ASCII string as well.
785sub conforms_to_mod_base64 {
786 my ($text) = @_;
787
788 # need to treat the entity ref for underscore as underscore
789 $text =~ s/&\#095;/_/g;
790
791 # base 64 takes alphanumeric and [=+/],
792 # but we use modified base64 where + and / are replaced with - and _
793 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
794}
795
796sub substr
797{
798 my ($utf8_string, $offset, $length) = @_;
799
800 my @unicode_string = @{&utf82unicode($utf8_string)};
801 my $unicode_string_length = scalar(@unicode_string);
802
803 my $substr_start = $offset;
804 if ($substr_start >= $unicode_string_length) {
805 return "";
806 }
807
808 my $substr_end = $offset + $length - 1;
809 if ($substr_end >= $unicode_string_length) {
810 $substr_end = $unicode_string_length - 1;
811 }
812
813 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
814 return &unicode2utf8(\@unicode_substring);
815}
816
817# perl version of stringToHex
818# Useful method to print UTF8 (or other unicode) for debugging.
819# Characters that are easily displayed (that is, printable ASCII)
820# are shown as-is, whereas hex values of the unicode code points
821# are shown for all other chars.
822sub debug_unicode_string
823{
824 join("",
825 map { $_ > 127 ? # if wide character...
826 sprintf("\\x{%04X}", $_) : # \x{...}
827 chr($_)
828 } unpack("U*", $_[0])); # unpack Unicode characters
829}
830
831sub raw_filename_to_url_encoded
832{
833 my ($str_in) = @_;
834
835 my @url_encoded_chars
836 = map { $_ > 255 ? # Needs to be represent in entity form
837 sprintf("&#x%X;",$_) :
838 $_>127 || $_==ord("%") ? # Representable in %XX form
839 sprintf("%%%2X", $_) :
840 chr($_) # otherwise, Ascii char
841 } unpack("U*", $str_in); # Unpack Unicode characters
842
843
844 my $str_out = join("", @url_encoded_chars);
845
846 return $str_out;
847
848}
849
850sub url_encoded_to_raw_filename
851{
852 my ($str_in) = @_;
853
854 my $str_out = $str_in;
855
856 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
857 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
858 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
859
860 return $str_out;
861}
862
863
864sub raw_filename_to_utf8_url_encoded
865{
866 my ($str_in) = @_;
867
868 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
869
870 my @url_encoded_chars
871 = map { $_ > 127 ? # Representable in %XX form
872 sprintf("%%%2X", $_) :
873 chr($_) # otherwise, Ascii char
874 } unpack("U*", $str_in); # Unpack utf8 characters
875
876
877 my $str_out = join("", @url_encoded_chars);
878
879 return $str_out;
880
881}
882
883sub utf8_url_encoded_to_raw_filename
884{
885 my ($str_in) = @_;
886
887 my $utf8_str_out = $str_in;
888
889 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
890
891 my $unicode_str_out = decode("utf8",$utf8_str_out);
892 my $raw_str_out = utf8::downgrade($unicode_str_out);
893
894 return $raw_str_out;
895}
896
897sub analyze_raw_string
898{
899 my ($str_in) = @_;
900
901 my $uses_bytecodes = 0;
902 my $exceeds_bytecodes = 0;
903
904 map { $exceeds_bytecodes = 1 if ($_ >= 256);
905 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
906 } unpack("U*", $str_in); # Unpack Unicode characters
907
908 return ($uses_bytecodes,$exceeds_bytecodes);
909}
910
911
912sub convert_utf8_string_to_unicode_string
913{
914 my $utf8_string = shift(@_);
915
916 my $unicode_string = "";
917 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
918 $unicode_string .= chr($unicode_value);
919 }
920 return $unicode_string;
921}
922
923sub convert_unicode_string_to_utf8_string
924{
925 my $unicode_string = shift(@_);
926
927 my @unicode_array;
928 for (my $i = 0; $i < length($unicode_string); $i++) {
929 push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
930 }
931 return &unicode::unicode2utf8(\@unicode_array);
932}
933
934
9351;
Note: See TracBrowser for help on using the repository browser.