source: main/trunk/greenstone2/perllib/unicode.pm@ 23387

Last change on this file since 23387 was 23387, checked in by davidb, 13 years ago

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.7 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use MIME::Base64; # for base64 encoding
39
40no strict 'refs';
41
42
43
44sub utf8decomp
45{
46 my ($str) = @_;
47
48 return if (!defined $str);
49 return "" if ($str eq "");
50
51 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
52
53 my @each_char
54 = map { ($_ > 255 )
55 ? # if wide character...
56 sprintf("\\x{%04X}", $_)
57 : # \x{...}
58 (chr($_) =~ m/[[:cntrl:]]/ )
59 ? # else if control character ...
60 sprintf("\\x%02X", $_)
61 : # \x..
62 quotemeta(chr($_)) # else quoted or as themselves
63 } @unpacked_chars;
64
65 return join("",@each_char);
66}
67
68
69sub hex_codepoint {
70 if (my $char = shift) {
71 return sprintf '%2.2x', unpack('U0U*', $char);
72 }
73}
74
75
76
77
78# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
79# and returns a unicode array.
80sub ascii2unicode {
81 my ($in) = @_;
82 my $out = [];
83
84 my $i = 0;
85 my $len = length($in);
86 while ($i < $len) {
87 push (@$out, ord(substr ($in, $i, 1)));
88 $i++;
89 }
90
91 return $out;
92}
93
94# ascii2utf8 takes a reference to an (extended) ascii string and returns a
95# UTF-8 encoded string. This is just a faster version of
96# "&unicode2utf8(&ascii2unicode($str));"
97# "Extended ascii" really means "iso_8859_1"
98sub ascii2utf8 {
99 my ($in) = @_;
100 my $out = "";
101
102 if (!defined($in)|| !defined($$in)) {
103 return $out;
104 }
105
106 my ($c);
107 my $i = 0;
108 my $len = length($$in);
109 while ($i < $len) {
110 $c = ord (substr ($$in, $i, 1));
111 if ($c < 0x80) {
112 # ascii character
113 $out .= chr ($c);
114
115 } else {
116 # extended ascii character
117 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
118 $out .= chr (0x80 + ($c & 0x3f));
119 }
120 $i++;
121 }
122
123 return $out;
124}
125
126# unicode2utf8 takes a unicode array as input and encodes it
127# using utf-8
128sub unicode2utf8 {
129 my ($in) = @_;
130 my $out = "";
131
132 foreach my $num (@$in) {
133 next unless defined $num;
134 if ($num < 0x80) {
135 $out .= chr ($num);
136
137 } elsif ($num < 0x800) {
138 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
139 $out .= chr (0x80 + ($num & 0x3f));
140
141 } elsif ($num < 0xFFFF) {
142 $out .= chr (0xe0 + (($num >> 12) & 0xf));
143 $out .= chr (0x80 + (($num >> 6) & 0x3f));
144 $out .= chr (0x80 + ($num & 0x3f));
145
146 } else {
147 # error, don't encode anything
148 die;
149 }
150 }
151 return $out;
152}
153
154# utf82unicode takes a utf-8 string and produces a unicode
155# array
156sub utf82unicode {
157 my ($in) = @_;
158 my $out = [];
159
160 my $i = 0;
161 my ($c1, $c2, $c3);
162 my $len = length($in);
163 while ($i < $len) {
164 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
165 # normal ascii character
166 push (@$out, $c1);
167
168 } elsif ($c1 < 0xc0) {
169 # error, was expecting the first byte of an
170 # encoded character. Do nothing.
171
172 } elsif ($c1 < 0xe0 && $i+1 < $len) {
173 # an encoded character with two bytes
174 $c2 = ord (substr ($in, $i+1, 1));
175 if ($c2 >= 0x80 && $c2 < 0xc0) {
176 # everything looks ok
177 push (@$out, ((($c1 & 0x1f) << 6) +
178 ($c2 & 0x3f)));
179 $i++; # gobbled an extra byte
180 }
181
182 } elsif ($c1 < 0xf0 && $i+2 < $len) {
183 # an encoded character with three bytes
184 $c2 = ord (substr ($in, $i+1, 1));
185 $c3 = ord (substr ($in, $i+2, 1));
186 if ($c2 >= 0x80 && $c2 < 0xc0 &&
187 $c3 >= 0x80 && $c3 < 0xc0) {
188 # everything looks ok
189 push (@$out, ((($c1 & 0xf) << 12) +
190 (($c2 & 0x3f) << 6) +
191 ($c3 & 0x3f)));
192
193 $i += 2; # gobbled an extra two bytes
194 }
195
196 } else {
197 # error, only decode Unicode characters not full UCS.
198 # Do nothing.
199 }
200
201 $i++;
202 }
203
204 return $out;
205}
206
207# unicode2ucs2 takes a unicode array and produces a UCS-2
208# unicode string (every two bytes forms a unicode character)
209sub unicode2ucs2 {
210 my ($in) = @_;
211 my $out = "";
212
213 foreach my $num (@$in) {
214 $out .= chr (($num & 0xff00) >> 8);
215 $out .= chr ($num & 0xff);
216 }
217
218 return $out;
219}
220
221# ucs22unicode takes a UCS-2 string and produces a unicode array
222sub ucs22unicode {
223 my ($in) = @_;
224 my $out = [];
225
226 my $i = 0;
227 my $len = length ($in);
228 while ($i+1 < $len) {
229 push (@$out, ord (substr($in, $i, 1)) << 8 +
230 ord (substr($in, $i+1, 1)));
231
232 $i ++;
233 }
234
235 return $out;
236}
237
238# takes a reference to a string and returns a reference to a unicode array
239sub convert2unicode {
240 my ($encoding, $textref) = @_;
241
242 if (!defined $encodings::encodings->{$encoding}) {
243 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
244 return [];
245 }
246
247 my $encodename = "$encoding-unicode";
248 my $enc_info = $encodings::encodings->{$encoding};
249 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
250 "to_uc", $enc_info->{'mapfile'});
251 if (!&loadmapencoding ($encodename, $mapfile)) {
252 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
253 return [];
254 }
255
256 if (defined $enc_info->{'converter'}) {
257 my $converter = $enc_info->{'converter'};
258 return &$converter ($encodename, $textref);
259 }
260
261 if ($unicode::translations{$encodename}->{'count'} == 1) {
262 return &singlebyte2unicode ($encodename, $textref);
263 } else {
264 return &doublebyte2unicode ($encodename, $textref);
265 }
266}
267
268# singlebyte2unicode converts simple 8 bit encodings where characters below
269# 0x80 are normal ascii characters and the rest are decoded using the
270# appropriate mapping files.
271#
272# Examples of encodings that may be converted using singlebyte2unicode are
273# the iso-8859 and windows-125* series.
274sub singlebyte2unicode {
275 my ($encodename, $textref) = @_;
276
277 my @outtext = ();
278 my $len = length($$textref);
279 my ($c);
280 my $i = 0;
281
282 while ($i < $len) {
283 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
284 # normal ascii character
285 push (@outtext, $c);
286 } else {
287 $c = &transchar ($encodename, $c);
288 # put a black square if cannot translate
289 $c = 0x25A1 if $c == 0;
290 push (@outtext, $c);
291 }
292 $i ++;
293 }
294 return \@outtext;
295}
296
297# doublebyte2unicode converts simple two byte encodings where characters
298# below code point 0x80 are single-byte characters and the rest are
299# double-byte characters.
300#
301# Examples of encodings that may be converted using doublebyte2unicode are
302# CJK encodings like GB encoded Chinese and UHC Korean.
303#
304# Note that no error checking is performed to make sure that the input text
305# is valid for the given encoding.
306#
307# Also, encodings that may contain characters of more than two bytes are
308# not supported (any EUC encoded text may in theory contain 3-byte
309# characters but in practice only one and two byte characters are used).
310sub doublebyte2unicode {
311 my ($encodename, $textref) = @_;
312
313 my @outtext = ();
314 my $len = length($$textref);
315 my ($c1, $c2);
316 my $i = 0;
317
318 while ($i < $len) {
319 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
320 if ($i+1 < $len) {
321 # double-byte character
322 $c2 = ord(substr($$textref, $i+1, 1));
323 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
324 # put a black square if cannot translate
325 $c = 0x25A1 if $c == 0;
326 push (@outtext, $c);
327 $i += 2;
328
329 } else {
330 # error
331 print STDERR "unicode: ERROR missing second half of double-byte character\n";
332 $i++;
333 }
334
335 } else {
336 # single-byte character
337 push (@outtext, $c1);
338 $i++;
339 }
340 }
341 return \@outtext;
342}
343
344# Shift-JIS to unicode
345# We can't use doublebyte2unicode for Shift-JIS because it uses some
346# single-byte characters above code point 0x80 (i.e. half-width katakana
347# characters in the range 0xA1-0xDF)
348sub shiftjis2unicode {
349 my ($encodename, $textref) = @_;
350
351 my @outtext = ();
352 my $len = length($$textref);
353 my ($c1, $c2);
354 my $i = 0;
355
356 while ($i < $len) {
357 $c1 = ord(substr($$textref, $i, 1));
358
359 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
360 # Single-byte half-width katakana character or
361 # JIS Roman yen or overline characters
362 my $c = &transchar ($encodename, $c1);
363 # - put a black square if cannot translate
364 $c = 0x25A1 if $c == 0;
365 push (@outtext, $c);
366 $i++;
367
368 } elsif ($c1 < 0x80) {
369 # ASCII
370 push (@outtext, $c1);
371 $i ++;
372
373 } elsif ($c1 < 0xEF) {
374 if ($i+1 < $len) {
375 $c2 = ord(substr($$textref, $i+1, 1));
376 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
377 # Double-byte shift-jis character
378 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
379 # put a black square if cannot translate
380 $c = 0x25A1 if $c == 0;
381 push (@outtext, $c);
382 } else {
383 # error
384 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
385 }
386 $i += 2;
387 } else {
388 # error
389 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
390 $i ++;
391 }
392 } else {
393 # error
394 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395 $i ++;
396 }
397 }
398 return \@outtext;
399}
400
401sub transchar {
402 my ($encoding, $from) = @_;
403 my $high = ($from / 256) % 256;
404 my $low = $from % 256;
405
406 return 0 unless defined $unicode::translations{$encoding};
407
408 my $block = $unicode::translations{$encoding}->{'map'};
409
410 if (ref ($block->[$high]) ne "ARRAY") {
411 return 0;
412 }
413 return $block->[$high]->[$low];
414}
415
416# %translations is of the form:
417#
418# encodings{encodingname-encodingname}->{'map'}->blocktranslation
419# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
420#
421# Any of the top translation blocks can point to an undefined
422# value. This data structure aims to allow fast translation and
423# efficient storage.
424%unicode::translations = ();
425
426# @array256 is used for initialisation, there must be
427# a better way...
428# What about this?: @array256 = (0) x 256;
429@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
430 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
445
446# returns 1 if successful, 0 if unsuccessful
447sub loadmapencoding {
448 my ($encoding, $mapfile) = @_;
449
450 # check to see if the encoding has already been loaded
451 return 1 if (defined $unicode::translations{$encoding});
452
453 if (! -r $mapfile || -d $mapfile) {
454 return 0;
455 }
456 return 0 unless open (MAPFILE, $mapfile);
457 binmode (MAPFILE);
458
459 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
460 my $block = $unicode::translations{$encoding};
461
462 my ($in,$i,$j);
463 while (1) {
464 my $ret=read(MAPFILE, $in, 1);
465 if (!defined($ret)) { # error
466 print STDERR "unicode.pm: error reading mapfile: $!\n";
467 last;
468 }
469 if ($ret != 1) { last }
470 $i = unpack ("C", $in);
471 $block->{'map'}->[$i] = [@unicode::array256];
472 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
473 my ($n1, $n2) = unpack ("CC", $in);
474 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
475 }
476 $block->{'count'} ++;
477 }
478
479 close (MAPFILE);
480}
481
482# unicode2singlebyte converts unicode to simple 8 bit encodings where
483# characters below 0x80 are normal ascii characters and the rest are encoded
484# using the appropriate mapping files.
485#
486# Examples of encodings that may be converted using unicode2singlebyte are
487# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
488sub unicode2singlebyte {
489 my ($uniref, $encoding) = @_;
490
491 my $outtext = "";
492 my $encodename = "unicode-$encoding";
493
494 if (!exists $encodings::encodings->{$encoding}) {
495 print STDERR "unicode.pm: ERROR - unsupported encoding "
496 . "'$encoding' requested\n";
497 return "";
498 }
499
500 my $enc_info = $encodings::encodings->{$encoding};
501 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
502 "from_uc", $enc_info->{'mapfile'});
503 if (!&loadmapencoding ($encodename, $mapfile)) {
504 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
505 return "";
506 }
507
508 foreach my $c (@$uniref) {
509 if ($c < 0x80) {
510 # normal ascii character
511 $outtext .= chr($c);
512 } else {
513 # extended ascii character
514 $c = &transchar ($encodename, $c);
515
516 # put a question mark if cannot translate
517 if ($c == 0) {
518 $outtext .= "?";
519 } else {
520 $outtext .= chr($c);
521 }
522 }
523 }
524 return $outtext;
525}
526
527
528# this makes sure that the referenced input string is utf8 encoded, and
529# will change/remove bytes that aren't.
530# returns 0 if the text was already utf8, or 1 if text modified to become utf8
531sub ensure_utf8 {
532 my $stringref=shift;
533
534 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
535 return $stringref;
536 }
537
538 my $value=$$stringref;
539
540 my $non_utf8_found = 0;
541 $value =~ m/^/g; # to set \G
542 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
543 my $highbytes=$1;
544 my $highbyteslength=length($highbytes);
545 # make sure this block of high bytes is utf-8
546 $highbytes =~ /^/g; # set pos()
547 my $byte_replaced = 0;
548 while ($highbytes =~
549 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
550 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
551 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
552 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
553 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
554 )*([\x80-\xff])? !xg
555 ) {
556 # this highbyte is "out-of-place" for valid utf-8
557 my $badbyte=$1;
558 if (!defined $badbyte) {next} # hit end of string
559 my $pos=pos($highbytes);
560 # replace bad byte. assume iso-8859-1 -> utf-8
561 # ascii2utf8 does "extended ascii"... ie iso-8859-1
562 my $replacement=&unicode::ascii2utf8(\$badbyte);
563 substr($highbytes, $pos-1, 1, $replacement);
564 # update the position to continue searching (for \G)
565 pos($highbytes) = $pos+length($replacement)-1;
566 $byte_replaced = 1;
567 }
568 if ($byte_replaced) {
569 # replace this block of high bytes in the $value
570 $non_utf8_found = 1;
571 my $replength=length($highbytes); # we've changed the length
572 my $textpos=pos($value); # pos at end of last match
573 # replace bad bytes with good bytes
574 substr($value, $textpos-$highbyteslength,
575 $highbyteslength, $highbytes);
576 # update the position to continue searching (for \G)
577 pos($value)=$textpos+($replength-$highbyteslength)+1;
578 }
579 }
580
581 $$stringref = $value;
582 return $non_utf8_found;
583}
584
585# Returns true (1) if the given string is utf8 and false (0) if it isn't.
586# Does not modify the string parameter.
587sub check_is_utf8 {
588 my $value=shift;
589
590 if (!defined($value)) {
591 return 0; # not utf8 because it is undefined
592 }
593
594 $value =~ m/^/g; # to set \G
595 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
596 my $highbytes=$1;
597 # make sure this block of high bytes is utf-8
598 $highbytes =~ /^/g; # set pos()
599 while ($highbytes =~
600 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
601 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
602 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
603 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
604 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
605 )*([\x80-\xff])? !xg
606 ) {
607 my $badbyte=$1;
608 if (defined $badbyte) { # not end of string
609 return 0; # non-utf8 found
610 }
611 }
612 }
613
614 return 1;
615}
616
617sub url_encode {
618 my ($text) = @_;
619
620 if (!&is_url_encoded($text)) {
621 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
622 # return the url-encoded character entity for underscore back to the entity
623 $text =~ s/%26%23095%3B/&\#095;/g;
624 }
625 return $text;
626}
627
628sub url_decode {
629 my ($text,$and_numeric_entities) = @_;
630
631 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
632
633 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
634 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
635 $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
636 }
637
638 return $text;
639}
640
641sub is_url_encoded {
642 my ($text) = @_;
643 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
644}
645
646# When a filename on the filesystem is already URL-encoded, the
647# URL to it will have %25s in place of every % sign, so that
648# URLs in html pages can refer to the URL-encoded filename.
649# This method changes the URL reference back into the actual
650# (URL-encoded) filename on the filesystem by replacing %25 with %.
651sub url_to_filename {
652 my ($text) =@_;
653 $text =~ s/%25/%/g if &is_url_encoded($text);
654 # DM safing would have replaced underscores with character entity &#095;
655 # in SourceFile meta. Undo any such change to get the filename referred to.
656 $text =~ s/&\#095;/_/g;
657 return $text;
658}
659
660# When a filename on the filesystem is already URL-encoded, the
661# URL to it will have %25s in place of every % sign, so that
662# URLs in html pages can refer to the URL-encoded filename.
663# Given a (URL-encoded) filename on the filesystem, this subroutine
664# returns the URL reference string for it by replacing % with %25.
665# The output string will be the same as the input string if the input
666# already contains one or more %25s. This is to prevent processing
667# a url more than once this way.
668sub filename_to_url {
669 my ($text) = @_;
670
671 if($text !~ m/%25/) {
672 $text =~ s/%/%25/g;
673 }
674 return $text;
675}
676
677sub base64_encode {
678 my ($text) = @_;
679 if(!&conforms_to_mod_base64($text)) {
680 # return entity for underscore to underscore before encoding
681 $text =~ s/&\#095;/_/g;
682
683 $text = &MIME::Base64::encode_base64($text);
684 # base64 encoding may introduce + and / signs,
685 # replacing them with - and _ to ensure it's filename-safe
686 $text =~ s/\+/\-/g; # + -> -
687 $text =~ s/\//\_/g; # / -> _
688 }
689 return $text;
690}
691
692# If the input fits the modified base64 pattern, this will try decoding it.
693# Still, this method does not guarantee the return value is the 'original', only
694# that the result is where the base64 decoding process has been applied once.
695# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
696sub base64_decode {
697 my ($text) = @_;
698 if(&conforms_to_mod_base64($text)) {
699 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
700 # have replaced them with - and _ respectively. Undo this before decoding.
701 $text =~ s/\-/\+/g; # - -> +
702 $text =~ s/\_/\//g; # _ -> /
703 $text = &MIME::Base64::decode_base64($text);
704 }
705 return $text;
706}
707
708# Returns true if the given string is compatible with a modified version
709# of base64 (where the + and / are replaced with - and _), a format which
710# includes also regular ASCII alphanumeric values. This method does not
711# guarantee that the given string is actually base64 encoded, since it will
712# return true for any simple alphanumeric ASCII string as well.
713sub conforms_to_mod_base64 {
714 my ($text) = @_;
715
716 # need to treat the entity ref for underscore as underscore
717 $text =~ s/&\#095;/_/g;
718
719 # base 64 takes alphanumeric and [=+/],
720 # but we use modified base64 where + and / are replaced with - and _
721 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
722}
723
724sub substr
725{
726 my ($utf8_string, $offset, $length) = @_;
727
728 my @unicode_string = @{&utf82unicode($utf8_string)};
729 my $unicode_string_length = scalar(@unicode_string);
730
731 my $substr_start = $offset;
732 if ($substr_start >= $unicode_string_length) {
733 return "";
734 }
735
736 my $substr_end = $offset + $length - 1;
737 if ($substr_end >= $unicode_string_length) {
738 $substr_end = $unicode_string_length - 1;
739 }
740
741 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
742 return &unicode2utf8(\@unicode_substring);
743}
744
745# Useful method to print UTF8 (or other unicode) for debugging.
746# Characters that are easily displayed (that is, printable ASCII)
747# are shown as-is, whereas hex values of the unicode code points
748# are shown for all other chars.
749sub debug_unicode_string
750{
751 join("",
752 map { $_ > 128 ? # if wide character...
753 sprintf("\\x{%04X}", $_) : # \x{...}
754 chr($_)
755 } unpack("U*", $_[0])); # unpack Unicode characters
756}
757
758
759sub raw_filename_to_url_encoded
760{
761 my ($str_in) = @_;
762
763 my @url_encoded_chars
764 = map { $_ > 255 ? # Needs to be represent in entity form
765 sprintf("&#x%X;",$_) :
766 $_ > 128 ? # Representable in %XX form
767 sprintf("%%%2X", $_) :
768 chr($_) # otherwise, Ascii char
769 } unpack("U*", $str_in); # Unpack Unicode characters
770
771
772 my $str_out = join("", @url_encoded_chars);
773
774 return $str_out;
775
776}
777
778sub url_encoded_to_raw_filename
779{
780 my ($str_in) = @_;
781
782 my $str_out = $str_in;
783
784 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
785 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
786 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
787
788 return $str_out;
789}
790
791
792sub raw_filename_to_utf8_url_encoded
793{
794 my ($str_in) = @_;
795
796 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
797
798 my @url_encoded_chars
799 = map { $_ > 128 ? # Representable in %XX form
800 sprintf("%%%2X", $_) :
801 chr($_) # otherwise, Ascii char
802 } unpack("U*", $str_in); # Unpack utf8 characters
803
804
805 my $str_out = join("", @url_encoded_chars);
806
807 return $str_out;
808
809}
810
811sub utf8_url_encoded_to_raw_filename
812{
813 my ($str_in) = @_;
814
815 my $utf8_str_out = $str_in;
816
817 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
818
819 my $unicode_str_out = decode("utf8",$utf8_str_out);
820 my $raw_str_out = utf8::downgrade($unicode_str_out);
821
822 return $raw_str_out;
823}
824
825sub analyze_raw_string
826{
827 my ($str_in) = @_;
828
829 my $uses_bytecodes = 0;
830 my $exceeds_bytecodes = 0;
831
832 map { $exceeds_bytecodes = 1 if ($_ >= 256);
833 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
834 } unpack("U*", $str_in); # Unpack Unicode characters
835
836 return ($uses_bytecodes,$exceeds_bytecodes);
837}
838
839
8401;
Note: See TracBrowser for help on using the repository browser.