source: main/trunk/greenstone2/perllib/unicode.pm@ 27333

Last change on this file since 27333 was 27333, checked in by ak19, 11 years ago
  1. Added the metadataaction::get_import_metadata_array() subroutine for completion and tested it. 2. A couple of unicode subroutines print warnings when parameters are undefined. Since they can be undefined sometimes, added a test for this so as to avoid the unnecessary warnings.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{
47 my ($str) = @_;
48
49 return if (!defined $str);
50 return "" if ($str eq "");
51
52 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54 my @each_char
55 = map { ($_ > 255 )
56 ? # if wide character...
57 sprintf("\\x{%04X}", $_)
58 : # \x{...}
59 (chr($_) =~ m/[[:cntrl:]]/ )
60 ? # else if control character ...
61 sprintf("\\x%02X", $_)
62 : # \x..
63 quotemeta(chr($_)) # else quoted or as themselves
64 } @unpacked_chars;
65
66 return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71 if (my $char = shift) {
72 return sprintf '%2.2x', unpack('U0U*', $char);
73 }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82 my ($in) = @_;
83 my $out = [];
84
85 my $i = 0;
86 my $len = length($in);
87 while ($i < $len) {
88 push (@$out, ord(substr ($in, $i, 1)));
89 $i++;
90 }
91
92 return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100 my ($in) = @_;
101 my $out = "";
102
103 if (!defined($in)|| !defined($$in)) {
104 return $out;
105 }
106
107 my ($c);
108 my $i = 0;
109 my $len = length($$in);
110 while ($i < $len) {
111 $c = ord (substr ($$in, $i, 1));
112 if ($c < 0x80) {
113 # ascii character
114 $out .= chr ($c);
115
116 } else {
117 # extended ascii character
118 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119 $out .= chr (0x80 + ($c & 0x3f));
120 }
121 $i++;
122 }
123
124 return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130 my ($in) = @_;
131 my $out = "";
132
133 foreach my $num (@$in) {
134 next unless defined $num;
135 if ($num < 0x80) {
136 $out .= chr ($num);
137
138 } elsif ($num < 0x800) {
139 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140 $out .= chr (0x80 + ($num & 0x3f));
141
142 } elsif ($num < 0xFFFF) {
143 $out .= chr (0xe0 + (($num >> 12) & 0xf));
144 $out .= chr (0x80 + (($num >> 6) & 0x3f));
145 $out .= chr (0x80 + ($num & 0x3f));
146
147 } else {
148 # error, don't encode anything
149 die;
150 }
151 }
152 return $out;
153}
154
155# utf82unicode takes a utf-8 string and produces a unicode
156# array
157sub utf82unicode {
158 my ($in) = @_;
159 my $out = [];
160
161 if(!defined $in) {
162 return $out;
163 }
164
165 my $i = 0;
166 my ($c1, $c2, $c3);
167 my $len = length($in);
168 while ($i < $len) {
169 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
170 # normal ascii character
171 push (@$out, $c1);
172
173 } elsif ($c1 < 0xc0) {
174 # error, was expecting the first byte of an
175 # encoded character. Do nothing.
176
177 } elsif ($c1 < 0xe0 && $i+1 < $len) {
178 # an encoded character with two bytes
179 $c2 = ord (substr ($in, $i+1, 1));
180 if ($c2 >= 0x80 && $c2 < 0xc0) {
181 # everything looks ok
182 push (@$out, ((($c1 & 0x1f) << 6) +
183 ($c2 & 0x3f)));
184 $i++; # gobbled an extra byte
185 }
186
187 } elsif ($c1 < 0xf0 && $i+2 < $len) {
188 # an encoded character with three bytes
189 $c2 = ord (substr ($in, $i+1, 1));
190 $c3 = ord (substr ($in, $i+2, 1));
191 if ($c2 >= 0x80 && $c2 < 0xc0 &&
192 $c3 >= 0x80 && $c3 < 0xc0) {
193 # everything looks ok
194 push (@$out, ((($c1 & 0xf) << 12) +
195 (($c2 & 0x3f) << 6) +
196 ($c3 & 0x3f)));
197
198 $i += 2; # gobbled an extra two bytes
199 }
200
201 } else {
202 # error, only decode Unicode characters not full UCS.
203 # Do nothing.
204 }
205
206 $i++;
207 }
208
209 return $out;
210}
211
212# unicode2ucs2 takes a unicode array and produces a UCS-2
213# unicode string (every two bytes forms a unicode character)
214sub unicode2ucs2 {
215 my ($in) = @_;
216 my $out = "";
217
218 foreach my $num (@$in) {
219 $out .= chr (($num & 0xff00) >> 8);
220 $out .= chr ($num & 0xff);
221 }
222
223 return $out;
224}
225
226# ucs22unicode takes a UCS-2 string and produces a unicode array
227sub ucs22unicode {
228 my ($in) = @_;
229 my $out = [];
230
231 my $i = 0;
232 my $len = length ($in);
233 while ($i+1 < $len) {
234 push (@$out, ord (substr($in, $i, 1)) << 8 +
235 ord (substr($in, $i+1, 1)));
236
237 $i ++;
238 }
239
240 return $out;
241}
242
243# takes a reference to a string and returns a reference to a unicode array
244sub convert2unicode {
245 my ($encoding, $textref) = @_;
246
247 if (!defined $encodings::encodings->{$encoding}) {
248 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
249 return [];
250 }
251
252 my $encodename = "$encoding-unicode";
253 my $enc_info = $encodings::encodings->{$encoding};
254 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
255 "to_uc", $enc_info->{'mapfile'});
256 if (!&loadmapencoding ($encodename, $mapfile)) {
257 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
258 return [];
259 }
260
261 if (defined $enc_info->{'converter'}) {
262 my $converter = $enc_info->{'converter'};
263 return &$converter ($encodename, $textref);
264 }
265
266 if ($unicode::translations{$encodename}->{'count'} == 1) {
267 return &singlebyte2unicode ($encodename, $textref);
268 } else {
269 return &doublebyte2unicode ($encodename, $textref);
270 }
271}
272
273# singlebyte2unicode converts simple 8 bit encodings where characters below
274# 0x80 are normal ascii characters and the rest are decoded using the
275# appropriate mapping files.
276#
277# Examples of encodings that may be converted using singlebyte2unicode are
278# the iso-8859 and windows-125* series.
279sub singlebyte2unicode {
280 my ($encodename, $textref) = @_;
281
282 my @outtext = ();
283 my $len = length($$textref);
284 my ($c);
285 my $i = 0;
286
287 while ($i < $len) {
288 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
289 # normal ascii character
290 push (@outtext, $c);
291 } else {
292 $c = &transchar ($encodename, $c);
293 # put a black square if cannot translate
294 $c = 0x25A1 if $c == 0;
295 push (@outtext, $c);
296 }
297 $i ++;
298 }
299 return \@outtext;
300}
301
302# doublebyte2unicode converts simple two byte encodings where characters
303# below code point 0x80 are single-byte characters and the rest are
304# double-byte characters.
305#
306# Examples of encodings that may be converted using doublebyte2unicode are
307# CJK encodings like GB encoded Chinese and UHC Korean.
308#
309# Note that no error checking is performed to make sure that the input text
310# is valid for the given encoding.
311#
312# Also, encodings that may contain characters of more than two bytes are
313# not supported (any EUC encoded text may in theory contain 3-byte
314# characters but in practice only one and two byte characters are used).
315sub doublebyte2unicode {
316 my ($encodename, $textref) = @_;
317
318 my @outtext = ();
319 my $len = length($$textref);
320 my ($c1, $c2);
321 my $i = 0;
322
323 while ($i < $len) {
324 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
325 if ($i+1 < $len) {
326 # double-byte character
327 $c2 = ord(substr($$textref, $i+1, 1));
328 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
329 # put a black square if cannot translate
330 $c = 0x25A1 if $c == 0;
331 push (@outtext, $c);
332 $i += 2;
333
334 } else {
335 # error
336 print STDERR "unicode: ERROR missing second half of double-byte character\n";
337 $i++;
338 }
339
340 } else {
341 # single-byte character
342 push (@outtext, $c1);
343 $i++;
344 }
345 }
346 return \@outtext;
347}
348
349# Shift-JIS to unicode
350# We can't use doublebyte2unicode for Shift-JIS because it uses some
351# single-byte characters above code point 0x80 (i.e. half-width katakana
352# characters in the range 0xA1-0xDF)
353sub shiftjis2unicode {
354 my ($encodename, $textref) = @_;
355
356 my @outtext = ();
357 my $len = length($$textref);
358 my ($c1, $c2);
359 my $i = 0;
360
361 while ($i < $len) {
362 $c1 = ord(substr($$textref, $i, 1));
363
364 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
365 # Single-byte half-width katakana character or
366 # JIS Roman yen or overline characters
367 my $c = &transchar ($encodename, $c1);
368 # - put a black square if cannot translate
369 $c = 0x25A1 if $c == 0;
370 push (@outtext, $c);
371 $i++;
372
373 } elsif ($c1 < 0x80) {
374 # ASCII
375 push (@outtext, $c1);
376 $i ++;
377
378 } elsif ($c1 < 0xEF) {
379 if ($i+1 < $len) {
380 $c2 = ord(substr($$textref, $i+1, 1));
381 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
382 # Double-byte shift-jis character
383 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
384 # put a black square if cannot translate
385 $c = 0x25A1 if $c == 0;
386 push (@outtext, $c);
387 } else {
388 # error
389 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
390 }
391 $i += 2;
392 } else {
393 # error
394 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
395 $i ++;
396 }
397 } else {
398 # error
399 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
400 $i ++;
401 }
402 }
403 return \@outtext;
404}
405
406sub transchar {
407 my ($encoding, $from) = @_;
408 my $high = ($from / 256) % 256;
409 my $low = $from % 256;
410
411 return 0 unless defined $unicode::translations{$encoding};
412
413 my $block = $unicode::translations{$encoding}->{'map'};
414
415 if (ref ($block->[$high]) ne "ARRAY") {
416 return 0;
417 }
418 return $block->[$high]->[$low];
419}
420
421# %translations is of the form:
422#
423# encodings{encodingname-encodingname}->{'map'}->blocktranslation
424# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
425#
426# Any of the top translation blocks can point to an undefined
427# value. This data structure aims to allow fast translation and
428# efficient storage.
429%unicode::translations = ();
430
431# @array256 is used for initialisation, there must be
432# a better way...
433# What about this?: @array256 = (0) x 256;
434@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
450
451# returns 1 if successful, 0 if unsuccessful
452sub loadmapencoding {
453 my ($encoding, $mapfile) = @_;
454
455 # check to see if the encoding has already been loaded
456 return 1 if (defined $unicode::translations{$encoding});
457
458 if (! -r $mapfile || -d $mapfile) {
459 return 0;
460 }
461 return 0 unless open (MAPFILE, $mapfile);
462 binmode (MAPFILE);
463
464 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
465 my $block = $unicode::translations{$encoding};
466
467 my ($in,$i,$j);
468 while (1) {
469 my $ret=read(MAPFILE, $in, 1);
470 if (!defined($ret)) { # error
471 print STDERR "unicode.pm: error reading mapfile: $!\n";
472 last;
473 }
474 if ($ret != 1) { last }
475 $i = unpack ("C", $in);
476 $block->{'map'}->[$i] = [@unicode::array256];
477 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
478 my ($n1, $n2) = unpack ("CC", $in);
479 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
480 }
481 $block->{'count'} ++;
482 }
483
484 close (MAPFILE);
485}
486
487# unicode2singlebyte converts unicode to simple 8 bit encodings where
488# characters below 0x80 are normal ascii characters and the rest are encoded
489# using the appropriate mapping files.
490#
491# Examples of encodings that may be converted using unicode2singlebyte are
492# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
493sub unicode2singlebyte {
494 my ($uniref, $encoding) = @_;
495
496 my $outtext = "";
497 my $encodename = "unicode-$encoding";
498
499 if (!exists $encodings::encodings->{$encoding}) {
500 print STDERR "unicode.pm: ERROR - unsupported encoding "
501 . "'$encoding' requested\n";
502 return "";
503 }
504
505 my $enc_info = $encodings::encodings->{$encoding};
506 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
507 "from_uc", $enc_info->{'mapfile'});
508 if (!&loadmapencoding ($encodename, $mapfile)) {
509 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
510 return "";
511 }
512
513 foreach my $c (@$uniref) {
514 if ($c < 0x80) {
515 # normal ascii character
516 $outtext .= chr($c);
517 } else {
518 # extended ascii character
519 $c = &transchar ($encodename, $c);
520
521 # put a question mark if cannot translate
522 if ($c == 0) {
523 $outtext .= "?";
524 } else {
525 $outtext .= chr($c);
526 }
527 }
528 }
529 return $outtext;
530}
531
532
533# this makes sure that the referenced input string is utf8 encoded, and
534# will change/remove bytes that aren't.
535# returns 0 if the text was already utf8, or 1 if text modified to become utf8
536sub ensure_utf8 {
537 my $stringref=shift;
538
539 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
540 return $stringref;
541 }
542
543 my $value=$$stringref;
544
545 my $non_utf8_found = 0;
546 $value =~ m/^/g; # to set \G
547 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
548 my $highbytes=$1;
549 my $highbyteslength=length($highbytes);
550 # make sure this block of high bytes is utf-8
551 $highbytes =~ /^/g; # set pos()
552 my $byte_replaced = 0;
553 while ($highbytes =~
554 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
555 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
556 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
557 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
558 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
559 )*([\x80-\xff])? !xg
560 ) {
561 # this highbyte is "out-of-place" for valid utf-8
562 my $badbyte=$1;
563 if (!defined $badbyte) {next} # hit end of string
564 my $pos=pos($highbytes);
565 # replace bad byte. assume iso-8859-1 -> utf-8
566 # ascii2utf8 does "extended ascii"... ie iso-8859-1
567 my $replacement=&unicode::ascii2utf8(\$badbyte);
568 substr($highbytes, $pos-1, 1, $replacement);
569 # update the position to continue searching (for \G)
570 pos($highbytes) = $pos+length($replacement)-1;
571 $byte_replaced = 1;
572 }
573 if ($byte_replaced) {
574 # replace this block of high bytes in the $value
575 $non_utf8_found = 1;
576 my $replength=length($highbytes); # we've changed the length
577 my $textpos=pos($value); # pos at end of last match
578 # replace bad bytes with good bytes
579 substr($value, $textpos-$highbyteslength,
580 $highbyteslength, $highbytes);
581 # update the position to continue searching (for \G)
582 pos($value)=$textpos+($replength-$highbyteslength)+1;
583 }
584 }
585
586 $$stringref = $value;
587 return $non_utf8_found;
588}
589
590# Returns true (1) if the given string is utf8 and false (0) if it isn't.
591# Does not modify the string parameter.
592sub check_is_utf8 {
593 my $value=shift;
594
595 if (!defined($value)) {
596 return 0; # not utf8 because it is undefined
597 }
598
599 $value =~ m/^/g; # to set \G
600 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
601 my $highbytes=$1;
602 # make sure this block of high bytes is utf-8
603 $highbytes =~ /^/g; # set pos()
604 while ($highbytes =~
605 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
606 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
607 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
608 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
609 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
610 )*([\x80-\xff])? !xg
611 ) {
612 my $badbyte=$1;
613 if (defined $badbyte) { # not end of string
614 return 0; # non-utf8 found
615 }
616 }
617 }
618
619 return 1;
620}
621
622sub url_encode {
623 my ($text) = @_;
624
625 if (!&is_url_encoded($text)) {
626 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
627 # return the url-encoded character entity for underscore back to the entity
628 $text =~ s/%26%23095%3B/&\#095;/g;
629 }
630 return $text;
631}
632
633sub url_decode {
634 my ($text,$and_numeric_entities) = @_;
635
636 if(defined $text) {
637 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
638
639 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
640 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
641 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
642 }
643 }
644
645 return $text;
646}
647
648sub url_decode_utf8 {
649 my ($text,$and_numeric_entities) = @_;
650
651 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
652
653 $text = Encode::decode("utf8",$text);
654
655 return $text;
656}
657
658sub is_url_encoded {
659 my ($text) = @_;
660 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
661}
662
663# When a filename on the filesystem is already URL-encoded, the
664# URL to it will have %25s in place of every % sign, so that
665# URLs in html pages can refer to the URL-encoded filename.
666# This method changes the URL reference back into the actual
667# (URL-encoded) filename on the filesystem by replacing %25 with %.
668sub url_to_filename {
669 my ($text) =@_;
670 $text =~ s/%25/%/g if &is_url_encoded($text);
671 # DM safing would have replaced underscores with character entity &#095;
672 # in SourceFile meta. Undo any such change to get the filename referred to.
673 $text =~ s/&\#095;/_/g;
674 return $text;
675}
676
677# When a filename on the filesystem is already URL-encoded, the
678# URL to it will have %25s in place of every % sign, so that
679# URLs in html pages can refer to the URL-encoded filename.
680# Given a (URL-encoded) filename on the filesystem, this subroutine
681# returns the URL reference string for it by replacing % with %25.
682# The output string will be the same as the input string if the input
683# already contains one or more %25s. This is to prevent processing
684# a url more than once this way.
685sub filename_to_url {
686 my ($text) = @_;
687
688 if($text !~ m/%25/) {
689 $text =~ s/%/%25/g;
690 }
691 return $text;
692}
693
694sub base64_encode {
695 my ($text) = @_;
696 if(!&conforms_to_mod_base64($text)) {
697 # return entity for underscore to underscore before encoding
698 $text =~ s/&\#095;/_/g;
699
700 $text = &MIME::Base64::encode_base64($text);
701 # base64 encoding may introduce + and / signs,
702 # replacing them with - and _ to ensure it's filename-safe
703 $text =~ s/\+/\-/g; # + -> -
704 $text =~ s/\//\_/g; # / -> _
705 }
706 return $text;
707}
708
709# If the input fits the modified base64 pattern, this will try decoding it.
710# Still, this method does not guarantee the return value is the 'original', only
711# that the result is where the base64 decoding process has been applied once.
712# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
713sub base64_decode {
714 my ($text) = @_;
715 if(&conforms_to_mod_base64($text)) {
716 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
717 # have replaced them with - and _ respectively. Undo this before decoding.
718 $text =~ s/\-/\+/g; # - -> +
719 $text =~ s/\_/\//g; # _ -> /
720 $text = &MIME::Base64::decode_base64($text);
721 }
722 return $text;
723}
724
725# Returns true if the given string is compatible with a modified version
726# of base64 (where the + and / are replaced with - and _), a format which
727# includes also regular ASCII alphanumeric values. This method does not
728# guarantee that the given string is actually base64 encoded, since it will
729# return true for any simple alphanumeric ASCII string as well.
730sub conforms_to_mod_base64 {
731 my ($text) = @_;
732
733 # need to treat the entity ref for underscore as underscore
734 $text =~ s/&\#095;/_/g;
735
736 # base 64 takes alphanumeric and [=+/],
737 # but we use modified base64 where + and / are replaced with - and _
738 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
739}
740
741sub substr
742{
743 my ($utf8_string, $offset, $length) = @_;
744
745 my @unicode_string = @{&utf82unicode($utf8_string)};
746 my $unicode_string_length = scalar(@unicode_string);
747
748 my $substr_start = $offset;
749 if ($substr_start >= $unicode_string_length) {
750 return "";
751 }
752
753 my $substr_end = $offset + $length - 1;
754 if ($substr_end >= $unicode_string_length) {
755 $substr_end = $unicode_string_length - 1;
756 }
757
758 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
759 return &unicode2utf8(\@unicode_substring);
760}
761
762# Useful method to print UTF8 (or other unicode) for debugging.
763# Characters that are easily displayed (that is, printable ASCII)
764# are shown as-is, whereas hex values of the unicode code points
765# are shown for all other chars.
766sub debug_unicode_string
767{
768 join("",
769 map { $_ > 127 ? # if wide character...
770 sprintf("\\x{%04X}", $_) : # \x{...}
771 chr($_)
772 } unpack("U*", $_[0])); # unpack Unicode characters
773}
774
775
776sub raw_filename_to_url_encoded
777{
778 my ($str_in) = @_;
779
780 my @url_encoded_chars
781 = map { $_ > 255 ? # Needs to be represent in entity form
782 sprintf("&#x%X;",$_) :
783 $_>127 || $_==ord("%") ? # Representable in %XX form
784 sprintf("%%%2X", $_) :
785 chr($_) # otherwise, Ascii char
786 } unpack("U*", $str_in); # Unpack Unicode characters
787
788
789 my $str_out = join("", @url_encoded_chars);
790
791 return $str_out;
792
793}
794
795sub url_encoded_to_raw_filename
796{
797 my ($str_in) = @_;
798
799 my $str_out = $str_in;
800
801 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
802 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
803 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
804
805 return $str_out;
806}
807
808
809sub raw_filename_to_utf8_url_encoded
810{
811 my ($str_in) = @_;
812
813 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
814
815 my @url_encoded_chars
816 = map { $_ > 127 ? # Representable in %XX form
817 sprintf("%%%2X", $_) :
818 chr($_) # otherwise, Ascii char
819 } unpack("U*", $str_in); # Unpack utf8 characters
820
821
822 my $str_out = join("", @url_encoded_chars);
823
824 return $str_out;
825
826}
827
828sub utf8_url_encoded_to_raw_filename
829{
830 my ($str_in) = @_;
831
832 my $utf8_str_out = $str_in;
833
834 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
835
836 my $unicode_str_out = decode("utf8",$utf8_str_out);
837 my $raw_str_out = utf8::downgrade($unicode_str_out);
838
839 return $raw_str_out;
840}
841
842sub analyze_raw_string
843{
844 my ($str_in) = @_;
845
846 my $uses_bytecodes = 0;
847 my $exceeds_bytecodes = 0;
848
849 map { $exceeds_bytecodes = 1 if ($_ >= 256);
850 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
851 } unpack("U*", $str_in); # Unpack Unicode characters
852
853 return ($uses_bytecodes,$exceeds_bytecodes);
854}
855
856
857sub convert_utf8_string_to_unicode_string
858{
859 my $utf8_string = shift(@_);
860
861 my $unicode_string = "";
862 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
863 $unicode_string .= chr($unicode_value);
864 }
865 return $unicode_string;
866}
867
868sub convert_unicode_string_to_utf8_string
869{
870 my $unicode_string = shift(@_);
871
872 my @unicode_array;
873 for (my $i = 0; $i < length($unicode_string); $i++) {
874 push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
875 }
876 return &unicode::unicode2utf8(\@unicode_array);
877}
878
879
8801;
Note: See TracBrowser for help on using the repository browser.