source: main/trunk/greenstone2/perllib/unicode.pm@ 33757

Last change on this file since 33757 was 33757, checked in by ak19, 4 years ago
  1. Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.5 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{
47 my ($str) = @_;
48
49 return if (!defined $str);
50 return "" if ($str eq "");
51
52 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54 my @each_char
55 = map { ($_ > 255 )
56 ? # if wide character...
57 sprintf("\\x{%04X}", $_)
58 : # \x{...}
59 (chr($_) =~ m/[[:cntrl:]]/ )
60 ? # else if control character ...
61 sprintf("\\x%02X", $_)
62 : # \x..
63 quotemeta(chr($_)) # else quoted or as themselves
64 } @unpacked_chars;
65
66 return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71 if (my $char = shift) {
72 return sprintf '%2.2x', unpack('U0U*', $char);
73 }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82 my ($in) = @_;
83 my $out = [];
84
85 my $i = 0;
86 my $len = length($in);
87 while ($i < $len) {
88 push (@$out, ord(substr ($in, $i, 1)));
89 $i++;
90 }
91
92 return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100 my ($in) = @_;
101 my $out = "";
102
103 if (!defined($in)|| !defined($$in)) {
104 return $out;
105 }
106
107 my ($c);
108 my $i = 0;
109 my $len = length($$in);
110 while ($i < $len) {
111 $c = ord (substr ($$in, $i, 1));
112 if ($c < 0x80) {
113 # ascii character
114 $out .= chr ($c);
115
116 } else {
117 # extended ascii character
118 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119 $out .= chr (0x80 + ($c & 0x3f));
120 }
121 $i++;
122 }
123
124 return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130 my ($in) = @_;
131 my $out = "";
132
133 foreach my $num (@$in) {
134 next unless defined $num;
135 if ($num < 0x80) {
136 $out .= chr ($num);
137
138 } elsif ($num < 0x800) {
139 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140 $out .= chr (0x80 + ($num & 0x3f));
141
142 } elsif ($num < 0xFFFF) {
143 $out .= chr (0xe0 + (($num >> 12) & 0xf));
144 $out .= chr (0x80 + (($num >> 6) & 0x3f));
145 $out .= chr (0x80 + ($num & 0x3f));
146
147 } else {
148 # error, don't encode anything
149 #die;
150 # Diego's bugfix: instead of aborting the import process, it
151 # is better to get a converted file with a few extra spaces
152 print STDERR "strange char: $num\n";
153 $out .= " ";
154
155 }
156 }
157 return $out;
158}
159
160# utf82unicode takes a utf-8 string and produces a unicode
161# array
162sub utf82unicode {
163 my ($in) = @_;
164 my $out = [];
165
166 if(!defined $in) {
167 return $out;
168 }
169
170 my $i = 0;
171 my ($c1, $c2, $c3);
172 my $len = length($in);
173 while ($i < $len) {
174 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175 # normal ascii character
176 push (@$out, $c1);
177
178 } elsif ($c1 < 0xc0) {
179 # error, was expecting the first byte of an
180 # encoded character. Do nothing.
181
182 } elsif ($c1 < 0xe0 && $i+1 < $len) {
183 # an encoded character with two bytes
184 $c2 = ord (substr ($in, $i+1, 1));
185 if ($c2 >= 0x80 && $c2 < 0xc0) {
186 # everything looks ok
187 push (@$out, ((($c1 & 0x1f) << 6) +
188 ($c2 & 0x3f)));
189 $i++; # gobbled an extra byte
190 }
191
192 } elsif ($c1 < 0xf0 && $i+2 < $len) {
193 # an encoded character with three bytes
194 $c2 = ord (substr ($in, $i+1, 1));
195 $c3 = ord (substr ($in, $i+2, 1));
196 if ($c2 >= 0x80 && $c2 < 0xc0 &&
197 $c3 >= 0x80 && $c3 < 0xc0) {
198 # everything looks ok
199 push (@$out, ((($c1 & 0xf) << 12) +
200 (($c2 & 0x3f) << 6) +
201 ($c3 & 0x3f)));
202
203 $i += 2; # gobbled an extra two bytes
204 }
205
206 } else {
207 # error, only decode Unicode characters not full UCS.
208 # Do nothing.
209 }
210
211 $i++;
212 }
213
214 return $out;
215}
216
217# unicode2ucs2 takes a unicode array and produces a UCS-2
218# unicode string (every two bytes forms a unicode character)
219sub unicode2ucs2 {
220 my ($in) = @_;
221 my $out = "";
222
223 foreach my $num (@$in) {
224 $out .= chr (($num & 0xff00) >> 8);
225 $out .= chr ($num & 0xff);
226 }
227
228 return $out;
229}
230
231# ucs22unicode takes a UCS-2 string and produces a unicode array
232sub ucs22unicode {
233 my ($in) = @_;
234 my $out = [];
235
236 my $i = 0;
237 my $len = length ($in);
238 while ($i+1 < $len) {
239 push (@$out, ord (substr($in, $i, 1)) << 8 +
240 ord (substr($in, $i+1, 1)));
241
242 $i ++;
243 }
244
245 return $out;
246}
247
248# takes a reference to a string and returns a reference to a unicode array
249sub convert2unicode {
250 my ($encoding, $textref) = @_;
251
252 if (!defined $encodings::encodings->{$encoding}) {
253 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254 return [];
255 }
256
257 my $encodename = "$encoding-unicode";
258 my $enc_info = $encodings::encodings->{$encoding};
259 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260 "to_uc", $enc_info->{'mapfile'});
261 if (!&loadmapencoding ($encodename, $mapfile)) {
262 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263 return [];
264 }
265
266 if (defined $enc_info->{'converter'}) {
267 my $converter = $enc_info->{'converter'};
268 return &$converter ($encodename, $textref);
269 }
270
271 if ($unicode::translations{$encodename}->{'count'} == 1) {
272 return &singlebyte2unicode ($encodename, $textref);
273 } else {
274 return &doublebyte2unicode ($encodename, $textref);
275 }
276}
277
278# singlebyte2unicode converts simple 8 bit encodings where characters below
279# 0x80 are normal ascii characters and the rest are decoded using the
280# appropriate mapping files.
281#
282# Examples of encodings that may be converted using singlebyte2unicode are
283# the iso-8859 and windows-125* series.
284sub singlebyte2unicode {
285 my ($encodename, $textref) = @_;
286
287 my @outtext = ();
288 my $len = length($$textref);
289 my ($c);
290 my $i = 0;
291
292 while ($i < $len) {
293 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294 # normal ascii character
295 push (@outtext, $c);
296 } else {
297 $c = &transchar ($encodename, $c);
298 # put a black square if cannot translate
299 $c = 0x25A1 if $c == 0;
300 push (@outtext, $c);
301 }
302 $i ++;
303 }
304 return \@outtext;
305}
306
307# doublebyte2unicode converts simple two byte encodings where characters
308# below code point 0x80 are single-byte characters and the rest are
309# double-byte characters.
310#
311# Examples of encodings that may be converted using doublebyte2unicode are
312# CJK encodings like GB encoded Chinese and UHC Korean.
313#
314# Note that no error checking is performed to make sure that the input text
315# is valid for the given encoding.
316#
317# Also, encodings that may contain characters of more than two bytes are
318# not supported (any EUC encoded text may in theory contain 3-byte
319# characters but in practice only one and two byte characters are used).
320sub doublebyte2unicode {
321 my ($encodename, $textref) = @_;
322
323 my @outtext = ();
324 my $len = length($$textref);
325 my ($c1, $c2);
326 my $i = 0;
327
328 while ($i < $len) {
329 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330 if ($i+1 < $len) {
331 # double-byte character
332 $c2 = ord(substr($$textref, $i+1, 1));
333 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
334 # put a black square if cannot translate
335 $c = 0x25A1 if $c == 0;
336 push (@outtext, $c);
337 $i += 2;
338
339 } else {
340 # error
341 print STDERR "unicode: ERROR missing second half of double-byte character\n";
342 $i++;
343 }
344
345 } else {
346 # single-byte character
347 push (@outtext, $c1);
348 $i++;
349 }
350 }
351 return \@outtext;
352}
353
354# Shift-JIS to unicode
355# We can't use doublebyte2unicode for Shift-JIS because it uses some
356# single-byte characters above code point 0x80 (i.e. half-width katakana
357# characters in the range 0xA1-0xDF)
358sub shiftjis2unicode {
359 my ($encodename, $textref) = @_;
360
361 my @outtext = ();
362 my $len = length($$textref);
363 my ($c1, $c2);
364 my $i = 0;
365
366 while ($i < $len) {
367 $c1 = ord(substr($$textref, $i, 1));
368
369 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
370 # Single-byte half-width katakana character or
371 # JIS Roman yen or overline characters
372 my $c = &transchar ($encodename, $c1);
373 # - put a black square if cannot translate
374 $c = 0x25A1 if $c == 0;
375 push (@outtext, $c);
376 $i++;
377
378 } elsif ($c1 < 0x80) {
379 # ASCII
380 push (@outtext, $c1);
381 $i ++;
382
383 } elsif ($c1 < 0xEF) {
384 if ($i+1 < $len) {
385 $c2 = ord(substr($$textref, $i+1, 1));
386 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
387 # Double-byte shift-jis character
388 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
389 # put a black square if cannot translate
390 $c = 0x25A1 if $c == 0;
391 push (@outtext, $c);
392 } else {
393 # error
394 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395 }
396 $i += 2;
397 } else {
398 # error
399 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400 $i ++;
401 }
402 } else {
403 # error
404 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405 $i ++;
406 }
407 }
408 return \@outtext;
409}
410
411sub transchar {
412 my ($encoding, $from) = @_;
413 my $high = ($from / 256) % 256;
414 my $low = $from % 256;
415
416 return 0 unless defined $unicode::translations{$encoding};
417
418 my $block = $unicode::translations{$encoding}->{'map'};
419
420 if (ref ($block->[$high]) ne "ARRAY") {
421 return 0;
422 }
423 return $block->[$high]->[$low];
424}
425
426# %translations is of the form:
427#
428# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430#
431# Any of the top translation blocks can point to an undefined
432# value. This data structure aims to allow fast translation and
433# efficient storage.
434%unicode::translations = ();
435
436# @array256 is used for initialisation, there must be
437# a better way...
438# What about this?: @array256 = (0) x 256;
439@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456# returns 1 if successful, 0 if unsuccessful
457sub loadmapencoding {
458 my ($encoding, $mapfile) = @_;
459
460 # check to see if the encoding has already been loaded
461 return 1 if (defined $unicode::translations{$encoding});
462
463 if (! -r $mapfile || -d $mapfile) {
464 return 0;
465 }
466 return 0 unless open (MAPFILE, $mapfile);
467 binmode (MAPFILE);
468
469 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470 my $block = $unicode::translations{$encoding};
471
472 my ($in,$i,$j);
473 while (1) {
474 my $ret=read(MAPFILE, $in, 1);
475 if (!defined($ret)) { # error
476 print STDERR "unicode.pm: error reading mapfile: $!\n";
477 last;
478 }
479 if ($ret != 1) { last }
480 $i = unpack ("C", $in);
481 $block->{'map'}->[$i] = [@unicode::array256];
482 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483 my ($n1, $n2) = unpack ("CC", $in);
484 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485 }
486 $block->{'count'} ++;
487 }
488
489 close (MAPFILE);
490}
491
492# unicode2singlebyte converts unicode to simple 8 bit encodings where
493# characters below 0x80 are normal ascii characters and the rest are encoded
494# using the appropriate mapping files.
495#
496# Examples of encodings that may be converted using unicode2singlebyte are
497# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498sub unicode2singlebyte {
499 my ($uniref, $encoding) = @_;
500
501 my $outtext = "";
502 my $encodename = "unicode-$encoding";
503
504 if (!exists $encodings::encodings->{$encoding}) {
505 print STDERR "unicode.pm: ERROR - unsupported encoding "
506 . "'$encoding' requested\n";
507 return "";
508 }
509
510 my $enc_info = $encodings::encodings->{$encoding};
511 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512 "from_uc", $enc_info->{'mapfile'});
513 if (!&loadmapencoding ($encodename, $mapfile)) {
514 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515 return "";
516 }
517
518 foreach my $c (@$uniref) {
519 if ($c < 0x80) {
520 # normal ascii character
521 $outtext .= chr($c);
522 } else {
523 # extended ascii character
524 $c = &transchar ($encodename, $c);
525
526 # put a question mark if cannot translate
527 if ($c == 0) {
528 $outtext .= "?";
529 } else {
530 $outtext .= chr($c);
531 }
532 }
533 }
534 return $outtext;
535}
536
537
538# this makes sure that the referenced input string is utf8 encoded, and
539# will change/remove bytes that aren't.
540# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541sub ensure_utf8 {
542 my $stringref=shift;
543
544 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
545 return $stringref;
546 }
547
548 my $value=$$stringref;
549
550 my $non_utf8_found = 0;
551 $value =~ m/^/g; # to set \G
552 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553 my $highbytes=$1;
554 my $highbyteslength=length($highbytes);
555 # make sure this block of high bytes is utf-8
556 $highbytes =~ /^/g; # set pos()
557 my $byte_replaced = 0;
558 while ($highbytes =~
559 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
560 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
561 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
562 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
563 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
564 )*([\x80-\xff])? !xg
565 ) {
566 # this highbyte is "out-of-place" for valid utf-8
567 my $badbyte=$1;
568 if (!defined $badbyte) {next} # hit end of string
569 my $pos=pos($highbytes);
570 # replace bad byte. assume iso-8859-1 -> utf-8
571 # ascii2utf8 does "extended ascii"... ie iso-8859-1
572 my $replacement=&unicode::ascii2utf8(\$badbyte);
573 substr($highbytes, $pos-1, 1, $replacement);
574 # update the position to continue searching (for \G)
575 pos($highbytes) = $pos+length($replacement)-1;
576 $byte_replaced = 1;
577 }
578 if ($byte_replaced) {
579 # replace this block of high bytes in the $value
580 $non_utf8_found = 1;
581 my $replength=length($highbytes); # we've changed the length
582 my $textpos=pos($value); # pos at end of last match
583 # replace bad bytes with good bytes
584 substr($value, $textpos-$highbyteslength,
585 $highbyteslength, $highbytes);
586 # update the position to continue searching (for \G)
587 pos($value)=$textpos+($replength-$highbyteslength)+1;
588 }
589 }
590
591 $$stringref = $value;
592 return $non_utf8_found;
593}
594
595# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596# Does not modify the string parameter.
597sub check_is_utf8 {
598 my $value=shift;
599
600 if (!defined($value)) {
601 return 0; # not utf8 because it is undefined
602 }
603
604 $value =~ m/^/g; # to set \G
605 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606 my $highbytes=$1;
607 # make sure this block of high bytes is utf-8
608 $highbytes =~ /^/g; # set pos()
609 while ($highbytes =~
610 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
611 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
612 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
613 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
614 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
615 )*([\x80-\xff])? !xg
616 ) {
617 my $badbyte=$1;
618 if (defined $badbyte) { # not end of string
619 return 0; # non-utf8 found
620 }
621 }
622 }
623
624 return 1;
625}
626
627sub url_encode {
628 my ($text) = @_;
629
630 if (!&is_url_encoded($text)) {
631 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632 # return the url-encoded character entity for underscore back to the entity
633 $text =~ s/%26%23095%3B/&\#095;/g;
634 }
635 return $text;
636}
637
638sub url_decode {
639 my ($text,$and_numeric_entities) = @_;
640
641 if(defined $text) {
642 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643
644 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647 }
648 }
649
650 return $text;
651}
652
653sub url_decode_utf8 {
654 my ($text,$and_numeric_entities) = @_;
655
656 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658 $text = Encode::decode("utf8",$text);
659
660 return $text;
661}
662
663sub is_url_encoded {
664 my ($text) = @_;
665 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
666}
667
668# When a filename on the filesystem is already URL-encoded, the
669# URL to it will have %25s in place of every % sign, so that
670# URLs in html pages can refer to the URL-encoded filename.
671# This method changes the URL reference back into the actual
672# (URL-encoded) filename on the filesystem by replacing %25 with %.
673sub url_to_filename {
674 my ($text) =@_;
675 $text =~ s/%25/%/g if &is_url_encoded($text);
676 # DM safing would have replaced underscores with character entity &#095;
677 # in SourceFile meta. Undo any such change to get the filename referred to.
678 $text =~ s/&\#095;/_/g;
679 return $text;
680}
681
682# When a filename on the filesystem is already URL-encoded, the
683# URL to it will have %25s in place of every % sign, so that
684# URLs in html pages can refer to the URL-encoded filename.
685# Given a (URL-encoded) filename on the filesystem, this subroutine
686# returns the URL reference string for it by replacing % with %25.
687# The output string will be the same as the input string if the input
688# already contains one or more %25s. This is to prevent processing
689# a url more than once this way.
690sub filename_to_url {
691 my ($text) = @_;
692
693 if($text !~ m/%25/) {
694 $text =~ s/%/%25/g;
695 }
696 return $text;
697}
698
699# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
700# If the $force_encode parameter is true, then this method WILL base64 encode whatever
701# string is passed in, including any plain ASCII string.
702# That means this method could double encode an already encoded string.
703# However, this method is necessary because on the GLI end, we can't detect whether a plain
704# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
705# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
706# (except the "import" prefix, which is special) or none of the parts.
707sub force_base64_encode {
708 my ($text) = @_;
709 my $force_encode = 1;
710 return &base64_encode($text, $force_encode);
711}
712
713sub base64_encode {
714 my ($text, $force_encode) = @_;
715 if($force_encode || !&conforms_to_mod_base64($text)) {
716 # return entity for underscore to underscore before encoding
717 $text =~ s/&\#095;/_/g;
718
719 $text = &MIME::Base64::encode_base64($text);
720 # base64 encoding may introduce + and / signs,
721 # replacing them with - and _ to ensure it's filename-safe
722 $text =~ s/\+/\-/g; # + -> -
723 $text =~ s/\//\_/g; # / -> _
724 }
725 return $text;
726}
727
728# If the input fits the modified base64 pattern, this will try decoding it.
729# Still, this method does not guarantee the return value is the 'original', only
730# that the result is where the base64 decoding process has been applied once.
731# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
732sub base64_decode {
733 my ($text) = @_;
734 if(&conforms_to_mod_base64($text)) {
735 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
736 # have replaced them with - and _ respectively. Undo this before decoding.
737 $text =~ s/\-/\+/g; # - -> +
738 $text =~ s/\_/\//g; # _ -> /
739 $text = &MIME::Base64::decode_base64($text);
740 }
741 return $text;
742}
743
744# Returns true if the given string is compatible with a modified version
745# of base64 (where the + and / are replaced with - and _), a format which
746# includes also regular ASCII alphanumeric values. This method does not
747# guarantee that the given string is actually base64 encoded, since it will
748# return true for any simple alphanumeric ASCII string as well.
749sub conforms_to_mod_base64 {
750 my ($text) = @_;
751
752 # need to treat the entity ref for underscore as underscore
753 $text =~ s/&\#095;/_/g;
754
755 # base 64 takes alphanumeric and [=+/],
756 # but we use modified base64 where + and / are replaced with - and _
757 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
758}
759
760sub substr
761{
762 my ($utf8_string, $offset, $length) = @_;
763
764 my @unicode_string = @{&utf82unicode($utf8_string)};
765 my $unicode_string_length = scalar(@unicode_string);
766
767 my $substr_start = $offset;
768 if ($substr_start >= $unicode_string_length) {
769 return "";
770 }
771
772 my $substr_end = $offset + $length - 1;
773 if ($substr_end >= $unicode_string_length) {
774 $substr_end = $unicode_string_length - 1;
775 }
776
777 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
778 return &unicode2utf8(\@unicode_substring);
779}
780
781# perl version of stringToHex
782# Useful method to print UTF8 (or other unicode) for debugging.
783# Characters that are easily displayed (that is, printable ASCII)
784# are shown as-is, whereas hex values of the unicode code points
785# are shown for all other chars.
786sub debug_unicode_string
787{
788 join("",
789 map { $_ > 127 ? # if wide character...
790 sprintf("\\x{%04X}", $_) : # \x{...}
791 chr($_)
792 } unpack("U*", $_[0])); # unpack Unicode characters
793}
794
795
796sub raw_filename_to_url_encoded
797{
798 my ($str_in) = @_;
799
800 my @url_encoded_chars
801 = map { $_ > 255 ? # Needs to be represent in entity form
802 sprintf("&#x%X;",$_) :
803 $_>127 || $_==ord("%") ? # Representable in %XX form
804 sprintf("%%%2X", $_) :
805 chr($_) # otherwise, Ascii char
806 } unpack("U*", $str_in); # Unpack Unicode characters
807
808
809 my $str_out = join("", @url_encoded_chars);
810
811 return $str_out;
812
813}
814
815sub url_encoded_to_raw_filename
816{
817 my ($str_in) = @_;
818
819 my $str_out = $str_in;
820
821 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
822 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
823 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
824
825 return $str_out;
826}
827
828
829sub raw_filename_to_utf8_url_encoded
830{
831 my ($str_in) = @_;
832
833 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
834
835 my @url_encoded_chars
836 = map { $_ > 127 ? # Representable in %XX form
837 sprintf("%%%2X", $_) :
838 chr($_) # otherwise, Ascii char
839 } unpack("U*", $str_in); # Unpack utf8 characters
840
841
842 my $str_out = join("", @url_encoded_chars);
843
844 return $str_out;
845
846}
847
848sub utf8_url_encoded_to_raw_filename
849{
850 my ($str_in) = @_;
851
852 my $utf8_str_out = $str_in;
853
854 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
855
856 my $unicode_str_out = decode("utf8",$utf8_str_out);
857 my $raw_str_out = utf8::downgrade($unicode_str_out);
858
859 return $raw_str_out;
860}
861
862sub analyze_raw_string
863{
864 my ($str_in) = @_;
865
866 my $uses_bytecodes = 0;
867 my $exceeds_bytecodes = 0;
868
869 map { $exceeds_bytecodes = 1 if ($_ >= 256);
870 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
871 } unpack("U*", $str_in); # Unpack Unicode characters
872
873 return ($uses_bytecodes,$exceeds_bytecodes);
874}
875
876
877sub convert_utf8_string_to_unicode_string
878{
879 my $utf8_string = shift(@_);
880
881 my $unicode_string = "";
882 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
883 $unicode_string .= chr($unicode_value);
884 }
885 return $unicode_string;
886}
887
888sub convert_unicode_string_to_utf8_string
889{
890 my $unicode_string = shift(@_);
891
892 my @unicode_array;
893 for (my $i = 0; $i < length($unicode_string); $i++) {
894 push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
895 }
896 return &unicode::unicode2utf8(\@unicode_array);
897}
898
899
9001;
Note: See TracBrowser for help on using the repository browser.