source: main/trunk/greenstone2/perllib/unicode.pm@ 23362

Last change on this file since 23362 was 23362, checked in by davidb, 13 years ago

Additional routines (and few upgraded) to help support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 21.5 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use MIME::Base64; # for base64 encoding
39
40no strict 'refs';
41
42
43
44sub utf8decomp
45{
46 my ($str) = @_;
47
48 return if (!defined $str);
49 return "" if ($str eq "");
50
51 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
52
53 my @each_char
54 = map { ($_ > 255 )
55 ? # if wide character...
56 sprintf("\\x{%04X}", $_)
57 : # \x{...}
58 (chr($_) =~ m/[[:cntrl:]]/ )
59 ? # else if control character ...
60 sprintf("\\x%02X", $_)
61 : # \x..
62 quotemeta(chr($_)) # else quoted or as themselves
63 } @unpacked_chars;
64
65 return join("",@each_char);
66}
67
68
69sub hex_codepoint {
70 if (my $char = shift) {
71 return sprintf '%2.2x', unpack('U0U*', $char);
72 }
73}
74
75
76
77
78# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
79# and returns a unicode array.
80sub ascii2unicode {
81 my ($in) = @_;
82 my $out = [];
83
84 my $i = 0;
85 my $len = length($in);
86 while ($i < $len) {
87 push (@$out, ord(substr ($in, $i, 1)));
88 $i++;
89 }
90
91 return $out;
92}
93
94# ascii2utf8 takes a reference to an (extended) ascii string and returns a
95# UTF-8 encoded string. This is just a faster version of
96# "&unicode2utf8(&ascii2unicode($str));"
97# "Extended ascii" really means "iso_8859_1"
98sub ascii2utf8 {
99 my ($in) = @_;
100 my $out = "";
101
102 if (!defined($in)|| !defined($$in)) {
103 return $out;
104 }
105
106 my ($c);
107 my $i = 0;
108 my $len = length($$in);
109 while ($i < $len) {
110 $c = ord (substr ($$in, $i, 1));
111 if ($c < 0x80) {
112 # ascii character
113 $out .= chr ($c);
114
115 } else {
116 # extended ascii character
117 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
118 $out .= chr (0x80 + ($c & 0x3f));
119 }
120 $i++;
121 }
122
123 return $out;
124}
125
126# unicode2utf8 takes a unicode array as input and encodes it
127# using utf-8
128sub unicode2utf8 {
129 my ($in) = @_;
130 my $out = "";
131
132 foreach my $num (@$in) {
133 next unless defined $num;
134 if ($num < 0x80) {
135 $out .= chr ($num);
136
137 } elsif ($num < 0x800) {
138 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
139 $out .= chr (0x80 + ($num & 0x3f));
140
141 } elsif ($num < 0xFFFF) {
142 $out .= chr (0xe0 + (($num >> 12) & 0xf));
143 $out .= chr (0x80 + (($num >> 6) & 0x3f));
144 $out .= chr (0x80 + ($num & 0x3f));
145
146 } else {
147 # error, don't encode anything
148 die;
149 }
150 }
151 return $out;
152}
153
154# utf82unicode takes a utf-8 string and produces a unicode
155# array
156sub utf82unicode {
157 my ($in) = @_;
158 my $out = [];
159
160 my $i = 0;
161 my ($c1, $c2, $c3);
162 my $len = length($in);
163 while ($i < $len) {
164 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
165 # normal ascii character
166 push (@$out, $c1);
167
168 } elsif ($c1 < 0xc0) {
169 # error, was expecting the first byte of an
170 # encoded character. Do nothing.
171
172 } elsif ($c1 < 0xe0 && $i+1 < $len) {
173 # an encoded character with two bytes
174 $c2 = ord (substr ($in, $i+1, 1));
175 if ($c2 >= 0x80 && $c2 < 0xc0) {
176 # everything looks ok
177 push (@$out, ((($c1 & 0x1f) << 6) +
178 ($c2 & 0x3f)));
179 $i++; # gobbled an extra byte
180 }
181
182 } elsif ($c1 < 0xf0 && $i+2 < $len) {
183 # an encoded character with three bytes
184 $c2 = ord (substr ($in, $i+1, 1));
185 $c3 = ord (substr ($in, $i+2, 1));
186 if ($c2 >= 0x80 && $c2 < 0xc0 &&
187 $c3 >= 0x80 && $c3 < 0xc0) {
188 # everything looks ok
189 push (@$out, ((($c1 & 0xf) << 12) +
190 (($c2 & 0x3f) << 6) +
191 ($c3 & 0x3f)));
192
193 $i += 2; # gobbled an extra two bytes
194 }
195
196 } else {
197 # error, only decode Unicode characters not full UCS.
198 # Do nothing.
199 }
200
201 $i++;
202 }
203
204 return $out;
205}
206
207# unicode2ucs2 takes a unicode array and produces a UCS-2
208# unicode string (every two bytes forms a unicode character)
209sub unicode2ucs2 {
210 my ($in) = @_;
211 my $out = "";
212
213 foreach my $num (@$in) {
214 $out .= chr (($num & 0xff00) >> 8);
215 $out .= chr ($num & 0xff);
216 }
217
218 return $out;
219}
220
221# ucs22unicode takes a UCS-2 string and produces a unicode array
222sub ucs22unicode {
223 my ($in) = @_;
224 my $out = [];
225
226 my $i = 0;
227 my $len = length ($in);
228 while ($i+1 < $len) {
229 push (@$out, ord (substr($in, $i, 1)) << 8 +
230 ord (substr($in, $i+1, 1)));
231
232 $i ++;
233 }
234
235 return $out;
236}
237
238# takes a reference to a string and returns a reference to a unicode array
239sub convert2unicode {
240 my ($encoding, $textref) = @_;
241
242 if (!defined $encodings::encodings->{$encoding}) {
243 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
244 return [];
245 }
246
247 my $encodename = "$encoding-unicode";
248 my $enc_info = $encodings::encodings->{$encoding};
249 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
250 "to_uc", $enc_info->{'mapfile'});
251 if (!&loadmapencoding ($encodename, $mapfile)) {
252 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
253 return [];
254 }
255
256 if (defined $enc_info->{'converter'}) {
257 my $converter = $enc_info->{'converter'};
258 return &$converter ($encodename, $textref);
259 }
260
261 if ($unicode::translations{$encodename}->{'count'} == 1) {
262 return &singlebyte2unicode ($encodename, $textref);
263 } else {
264 return &doublebyte2unicode ($encodename, $textref);
265 }
266}
267
268# singlebyte2unicode converts simple 8 bit encodings where characters below
269# 0x80 are normal ascii characters and the rest are decoded using the
270# appropriate mapping files.
271#
272# Examples of encodings that may be converted using singlebyte2unicode are
273# the iso-8859 and windows-125* series.
274sub singlebyte2unicode {
275 my ($encodename, $textref) = @_;
276
277 my @outtext = ();
278 my $len = length($$textref);
279 my ($c);
280 my $i = 0;
281
282 while ($i < $len) {
283 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
284 # normal ascii character
285 push (@outtext, $c);
286 } else {
287 $c = &transchar ($encodename, $c);
288 # put a black square if cannot translate
289 $c = 0x25A1 if $c == 0;
290 push (@outtext, $c);
291 }
292 $i ++;
293 }
294 return \@outtext;
295}
296
297# doublebyte2unicode converts simple two byte encodings where characters
298# below code point 0x80 are single-byte characters and the rest are
299# double-byte characters.
300#
301# Examples of encodings that may be converted using doublebyte2unicode are
302# CJK encodings like GB encoded Chinese and UHC Korean.
303#
304# Note that no error checking is performed to make sure that the input text
305# is valid for the given encoding.
306#
307# Also, encodings that may contain characters of more than two bytes are
308# not supported (any EUC encoded text may in theory contain 3-byte
309# characters but in practice only one and two byte characters are used).
310sub doublebyte2unicode {
311 my ($encodename, $textref) = @_;
312
313 my @outtext = ();
314 my $len = length($$textref);
315 my ($c1, $c2);
316 my $i = 0;
317
318 while ($i < $len) {
319 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
320 if ($i+1 < $len) {
321 # double-byte character
322 $c2 = ord(substr($$textref, $i+1, 1));
323 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
324 # put a black square if cannot translate
325 $c = 0x25A1 if $c == 0;
326 push (@outtext, $c);
327 $i += 2;
328
329 } else {
330 # error
331 print STDERR "unicode: ERROR missing second half of double-byte character\n";
332 $i++;
333 }
334
335 } else {
336 # single-byte character
337 push (@outtext, $c1);
338 $i++;
339 }
340 }
341 return \@outtext;
342}
343
344# Shift-JIS to unicode
345# We can't use doublebyte2unicode for Shift-JIS because it uses some
346# single-byte characters above code point 0x80 (i.e. half-width katakana
347# characters in the range 0xA1-0xDF)
348sub shiftjis2unicode {
349 my ($encodename, $textref) = @_;
350
351 my @outtext = ();
352 my $len = length($$textref);
353 my ($c1, $c2);
354 my $i = 0;
355
356 while ($i < $len) {
357 $c1 = ord(substr($$textref, $i, 1));
358
359 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
360 # Single-byte half-width katakana character or
361 # JIS Roman yen or overline characters
362 my $c = &transchar ($encodename, $c1);
363 # - put a black square if cannot translate
364 $c = 0x25A1 if $c == 0;
365 push (@outtext, $c);
366 $i++;
367
368 } elsif ($c1 < 0x80) {
369 # ASCII
370 push (@outtext, $c1);
371 $i ++;
372
373 } elsif ($c1 < 0xEF) {
374 if ($i+1 < $len) {
375 $c2 = ord(substr($$textref, $i+1, 1));
376 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
377 # Double-byte shift-jis character
378 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
379 # put a black square if cannot translate
380 $c = 0x25A1 if $c == 0;
381 push (@outtext, $c);
382 } else {
383 # error
384 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
385 }
386 $i += 2;
387 } else {
388 # error
389 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
390 $i ++;
391 }
392 } else {
393 # error
394 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395 $i ++;
396 }
397 }
398 return \@outtext;
399}
400
401sub transchar {
402 my ($encoding, $from) = @_;
403 my $high = ($from / 256) % 256;
404 my $low = $from % 256;
405
406 return 0 unless defined $unicode::translations{$encoding};
407
408 my $block = $unicode::translations{$encoding}->{'map'};
409
410 if (ref ($block->[$high]) ne "ARRAY") {
411 return 0;
412 }
413 return $block->[$high]->[$low];
414}
415
416# %translations is of the form:
417#
418# encodings{encodingname-encodingname}->{'map'}->blocktranslation
419# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
420#
421# Any of the top translation blocks can point to an undefined
422# value. This data structure aims to allow fast translation and
423# efficient storage.
424%unicode::translations = ();
425
426# @array256 is used for initialisation, there must be
427# a better way...
428# What about this?: @array256 = (0) x 256;
429@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
430 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
445
446# returns 1 if successful, 0 if unsuccessful
447sub loadmapencoding {
448 my ($encoding, $mapfile) = @_;
449
450 # check to see if the encoding has already been loaded
451 return 1 if (defined $unicode::translations{$encoding});
452
453 if (! -r $mapfile || -d $mapfile) {
454 return 0;
455 }
456 return 0 unless open (MAPFILE, $mapfile);
457 binmode (MAPFILE);
458
459 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
460 my $block = $unicode::translations{$encoding};
461
462 my ($in,$i,$j);
463 while (1) {
464 my $ret=read(MAPFILE, $in, 1);
465 if (!defined($ret)) { # error
466 print STDERR "unicode.pm: error reading mapfile: $!\n";
467 last;
468 }
469 if ($ret != 1) { last }
470 $i = unpack ("C", $in);
471 $block->{'map'}->[$i] = [@unicode::array256];
472 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
473 my ($n1, $n2) = unpack ("CC", $in);
474 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
475 }
476 $block->{'count'} ++;
477 }
478
479 close (MAPFILE);
480}
481
482# unicode2singlebyte converts unicode to simple 8 bit encodings where
483# characters below 0x80 are normal ascii characters and the rest are encoded
484# using the appropriate mapping files.
485#
486# Examples of encodings that may be converted using unicode2singlebyte are
487# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
488sub unicode2singlebyte {
489 my ($uniref, $encoding) = @_;
490
491 my $outtext = "";
492 my $encodename = "unicode-$encoding";
493
494 if (!exists $encodings::encodings->{$encoding}) {
495 print STDERR "unicode.pm: ERROR - unsupported encoding "
496 . "'$encoding' requested\n";
497 return "";
498 }
499
500 my $enc_info = $encodings::encodings->{$encoding};
501 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
502 "from_uc", $enc_info->{'mapfile'});
503 if (!&loadmapencoding ($encodename, $mapfile)) {
504 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
505 return "";
506 }
507
508 foreach my $c (@$uniref) {
509 if ($c < 0x80) {
510 # normal ascii character
511 $outtext .= chr($c);
512 } else {
513 # extended ascii character
514 $c = &transchar ($encodename, $c);
515
516 # put a question mark if cannot translate
517 if ($c == 0) {
518 $outtext .= "?";
519 } else {
520 $outtext .= chr($c);
521 }
522 }
523 }
524 return $outtext;
525}
526
527
528# this makes sure that the referenced input string is utf8 encoded, and
529# will change/remove bytes that aren't.
530# returns 0 if the text was already utf8, or 1 if text modified to become utf8
531sub ensure_utf8 {
532 my $stringref=shift;
533
534 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
535 return $stringref;
536 }
537
538 my $value=$$stringref;
539
540 my $non_utf8_found = 0;
541 $value =~ m/^/g; # to set \G
542 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
543 my $highbytes=$1;
544 my $highbyteslength=length($highbytes);
545 # make sure this block of high bytes is utf-8
546 $highbytes =~ /^/g; # set pos()
547 my $byte_replaced = 0;
548 while ($highbytes =~
549 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
550 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
551 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
552 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
553 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
554 )*([\x80-\xff])? !xg
555 ) {
556 # this highbyte is "out-of-place" for valid utf-8
557 my $badbyte=$1;
558 if (!defined $badbyte) {next} # hit end of string
559 my $pos=pos($highbytes);
560 # replace bad byte. assume iso-8859-1 -> utf-8
561 # ascii2utf8 does "extended ascii"... ie iso-8859-1
562 my $replacement=&unicode::ascii2utf8(\$badbyte);
563 substr($highbytes, $pos-1, 1, $replacement);
564 # update the position to continue searching (for \G)
565 pos($highbytes) = $pos+length($replacement)-1;
566 $byte_replaced = 1;
567 }
568 if ($byte_replaced) {
569 # replace this block of high bytes in the $value
570 $non_utf8_found = 1;
571 my $replength=length($highbytes); # we've changed the length
572 my $textpos=pos($value); # pos at end of last match
573 # replace bad bytes with good bytes
574 substr($value, $textpos-$highbyteslength,
575 $highbyteslength, $highbytes);
576 # update the position to continue searching (for \G)
577 pos($value)=$textpos+($replength-$highbyteslength)+1;
578 }
579 }
580
581 $$stringref = $value;
582 return $non_utf8_found;
583}
584
585# Returns true (1) if the given string is utf8 and false (0) if it isn't.
586# Does not modify the string parameter.
587sub check_is_utf8 {
588 my $value=shift;
589
590 if (!defined($value)) {
591 return 0; # not utf8 because it is undefined
592 }
593
594 $value =~ m/^/g; # to set \G
595 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
596 my $highbytes=$1;
597 # make sure this block of high bytes is utf-8
598 $highbytes =~ /^/g; # set pos()
599 while ($highbytes =~
600 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
601 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
602 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
603 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
604 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
605 )*([\x80-\xff])? !xg
606 ) {
607 my $badbyte=$1;
608 if (defined $badbyte) { # not end of string
609 return 0; # non-utf8 found
610 }
611 }
612 }
613
614 return 1;
615}
616
617sub url_encode {
618 my ($text) = @_;
619
620 if (!&is_url_encoded($text)) {
621 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
622 # return the url-encoded character entity for underscore back to the entity
623 $text =~ s/%26%23095%3B/&\#095;/g;
624 }
625 return $text;
626}
627
628sub url_decode {
629 my ($text) = @_;
630
631 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
632 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
633 $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
634
635 return $text;
636}
637
638sub is_url_encoded {
639 my ($text) = @_;
640 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
641}
642
643# When a filename on the filesystem is already URL-encoded, the
644# URL to it will have %25s in place of every % sign, so that
645# URLs in html pages can refer to the URL-encoded filename.
646# This method changes the URL reference back into the actual
647# (URL-encoded) filename on the filesystem by replacing %25 with %.
648sub url_to_filename {
649 my ($text) =@_;
650 $text =~ s/%25/%/g if &is_url_encoded($text);
651 # DM safing would have replaced underscores with character entity &#095;
652 # in SourceFile meta. Undo any such change to get the filename referred to.
653 $text =~ s/&\#095;/_/g;
654 return $text;
655}
656
657# When a filename on the filesystem is already URL-encoded, the
658# URL to it will have %25s in place of every % sign, so that
659# URLs in html pages can refer to the URL-encoded filename.
660# Given a (URL-encoded) filename on the filesystem, this subroutine
661# returns the URL reference string for it by replacing % with %25.
662# The output string will be the same as the input string if the input
663# already contains one or more %25s. This is to prevent processing
664# a url more than once this way.
665sub filename_to_url {
666 my ($text) = @_;
667
668 if($text !~ m/%25/) {
669 $text =~ s/%/%25/g;
670 }
671 return $text;
672}
673
674sub base64_encode {
675 my ($text) = @_;
676 if(!&conforms_to_mod_base64($text)) {
677 # return entity for underscore to underscore before encoding
678 $text =~ s/&\#095;/_/g;
679
680 $text = &MIME::Base64::encode_base64($text);
681 # base64 encoding may introduce + and / signs,
682 # replacing them with - and _ to ensure it's filename-safe
683 $text =~ s/\+/\-/g; # + -> -
684 $text =~ s/\//\_/g; # / -> _
685 }
686 return $text;
687}
688
689# If the input fits the modified base64 pattern, this will try decoding it.
690# Still, this method does not guarantee the return value is the 'original', only
691# that the result is where the base64 decoding process has been applied once.
692# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
693sub base64_decode {
694 my ($text) = @_;
695 if(&conforms_to_mod_base64($text)) {
696 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
697 # have replaced them with - and _ respectively. Undo this before decoding.
698 $text =~ s/\-/\+/g; # - -> +
699 $text =~ s/\_/\//g; # _ -> /
700 $text = &MIME::Base64::decode_base64($text);
701 }
702 return $text;
703}
704
705# Returns true if the given string is compatible with a modified version
706# of base64 (where the + and / are replaced with - and _), a format which
707# includes also regular ASCII alphanumeric values. This method does not
708# guarantee that the given string is actually base64 encoded, since it will
709# return true for any simple alphanumeric ASCII string as well.
710sub conforms_to_mod_base64 {
711 my ($text) = @_;
712
713 # need to treat the entity ref for underscore as underscore
714 $text =~ s/&\#095;/_/g;
715
716 # base 64 takes alphanumeric and [=+/],
717 # but we use modified base64 where + and / are replaced with - and _
718 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
719}
720
721sub substr
722{
723 my ($utf8_string, $offset, $length) = @_;
724
725 my @unicode_string = @{&utf82unicode($utf8_string)};
726 my $unicode_string_length = scalar(@unicode_string);
727
728 my $substr_start = $offset;
729 if ($substr_start >= $unicode_string_length) {
730 return "";
731 }
732
733 my $substr_end = $offset + $length - 1;
734 if ($substr_end >= $unicode_string_length) {
735 $substr_end = $unicode_string_length - 1;
736 }
737
738 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
739 return &unicode2utf8(\@unicode_substring);
740}
741
742# Useful method to print UTF8 (or other unicode) for debugging.
743# Characters that are easily displayed (that is, printable ASCII)
744# are shown as-is, whereas hex values of the unicode code points
745# are shown for all other chars.
746sub debug_unicode_string
747{
748 join("",
749 map { $_ > 128 ? # if wide character...
750 sprintf("\\x{%04X}", $_) : # \x{...}
751 chr($_)
752 } unpack("U*", $_[0])); # unpack Unicode characters
753}
754
755
756sub raw_filename_to_url_encoded
757{
758 my ($str_in) = @_;
759
760 my @url_encoded_chars
761 = map { $_ > 255 ? # Needs to be represent in entity form
762 sprintf("&#x%X;",$_) :
763 $_ > 128 ? # Representable in %XX form
764 sprintf("%%%2X", $_) :
765 chr($_) # otherwise, Ascii char
766 } unpack("U*", $str_in); # Unpack Unicode characters
767
768
769 my $str_out = join("", @url_encoded_chars);
770
771 return $str_out;
772
773}
774
775
776sub url_encoded_to_raw_filename
777{
778 my ($str_in) = @_;
779
780 my $str_out = $str_in;
781
782 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
783 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
784 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
785
786 return $str_out;
787}
788
7891;
Note: See TracBrowser for help on using the repository browser.