Ignore:
Timestamp:
2010-12-01T11:40:36+13:00 (13 years ago)
Author:
davidb
Message:

Additional routines (and a few upgraded ones) to help support Greenstone working with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/unicode.pm

    r23304 r23362  
    619619   
    620620    if (!&is_url_encoded($text)) {
    621     $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
     621    $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
    622622    # return the url-encoded character entity for underscore back to the entity
    623623    $text =~ s/%26%23095%3B/&\#095;/g;
     
    629629    my ($text) = @_;
    630630
    631     $text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
     631    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
     632    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
     633    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
     634
    632635    return $text;
    633636}
     
    635638sub is_url_encoded {
    636639    my ($text) = @_;
    637     return ($text =~ m/\%([A-F0-9]{2})/);
     640    return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
    638641}
    639642
     
    756759
    757760    my @url_encoded_chars
    758     = map { $_ > 128 ?                      # if wide character...
    759             sprintf("%%%2X", $_) :  # \x{...}
    760             chr($_)         
    761         } unpack("U*", $str_in);        # unpack Unicode characters
     761    = map { $_ > 255 ?                  # Needs to be represent in entity form
     762            sprintf("&#x%X;",$_) : 
     763            $_ > 128 ?              # Representable in %XX form
     764            sprintf("%%%2X", $_) : 
     765            chr($_)                 # otherwise, Ascii char
     766        } unpack("U*", $str_in); # Unpack Unicode characters
    762767
    763768   
     
    775780    my $str_out = $str_in;
    776781
    777     $str_out =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
     782    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
     783    $str_out =~ s/&#([0-9]+);/chr($1)/eig;
     784    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
    778785
    779786    return $str_out;
Note: See TracChangeset for help on using the changeset viewer.