Ignore:
Timestamp:
2010-11-09T14:48:28+13:00 (13 years ago)
Author:
davidb
Message:

Routines added that support converting raw filenames (i.e. byte-code strings) to %xx for any non-ASCII characters. Some extra routines for debugging UTF8/Unicode strings added also

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/unicode.pm

    r23285 r23304  
    3939
    4040no strict 'refs';
     41
     42
     43
     44sub utf8decomp
     45{   
     46    my ($str) = @_;
     47
     48    return if (!defined $str);
     49    return "" if ($str eq "");
     50   
     51    my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
     52
     53    my @each_char
     54    = map { ($_ > 255 )
     55           ? # if wide character...
     56             sprintf("\\x{%04X}", $_)
     57           : # \x{...}
     58             (chr($_) =~ m/[[:cntrl:]]/ )
     59             ? # else if control character ...
     60               sprintf("\\x%02X", $_)
     61             : # \x..
     62               quotemeta(chr($_)) # else quoted or as themselves
     63           } @unpacked_chars;
     64   
     65    return join("",@each_char);
     66}
     67
     68
     69sub hex_codepoint {
     70    if (my $char = shift) {
     71        return sprintf '%2.2x', unpack('U0U*', $char);
     72    }
     73}
     74
     75
     76
    4177
    4278# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
     
    714750}
    715751
     752
     753sub raw_filename_to_url_encoded
     754{
     755    my ($str_in) = @_;
     756
     757    my @url_encoded_chars
     758    = map { $_ > 128 ?                      # if wide character...
     759            sprintf("%%%2X", $_) :  # \x{...}
     760            chr($_)         
     761        } unpack("U*", $str_in);        # unpack Unicode characters
     762
     763   
     764    my $str_out = join("", @url_encoded_chars);
     765
     766    return $str_out;
     767
     768}
     769
     770
     771sub url_encoded_to_raw_filename
     772{
     773    my ($str_in) = @_;
     774
     775    my $str_out = $str_in;
     776
     777    $str_out =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
     778
     779    return $str_out;
     780}
     781
    7167821;
Note: See TracChangeset for help on using the changeset viewer.