Ignore:
Timestamp:
2010-12-06T13:15:10+13:00 (13 years ago)
Author:
davidb
Message:

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document lookup. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF-8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode-aware strings rather than in UTF-8 encoded but Unicode-unaware strings.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/unicode.pm

    r23371 r23387  
    627627
    628628sub url_decode {
    629     my ($text) = @_;
     629    my ($text,$and_numeric_entities) = @_;
    630630
    631631    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
    632     $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
    633     $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
     632
     633    if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
     634    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
     635    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
     636    }
    634637
    635638    return $text;
     
    773776}
    774777
    775 
    776778sub url_encoded_to_raw_filename
    777779{
     
    787789}
    788790
     791
     792sub raw_filename_to_utf8_url_encoded
     793{
     794    my ($str_in) = @_;
     795
     796    $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
     797
     798    my @url_encoded_chars
     799    = map { $_ > 128 ?                  # Representable in %XX form
     800            sprintf("%%%2X", $_) : 
     801            chr($_)                 # otherwise, Ascii char
     802        } unpack("U*", $str_in); # Unpack utf8 characters
     803
     804   
     805    my $str_out = join("", @url_encoded_chars);
     806
     807    return $str_out;
     808
     809}
     810
     811sub utf8_url_encoded_to_raw_filename
     812{
     813    my ($str_in) = @_;
     814
     815    my $utf8_str_out = $str_in;
     816
     817    $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
     818
     819    my $unicode_str_out = decode("utf8",$utf8_str_out);
     820    my $raw_str_out = utf8::downgrade($unicode_str_out);
     821   
     822    return $raw_str_out;
     823}
     824
     825sub analyze_raw_string
     826{
     827    my ($str_in) = @_;
     828
     829    my $uses_bytecodes = 0;
     830    my $exceeds_bytecodes = 0;
     831
     832    map { $exceeds_bytecodes = 1 if ($_ >= 256);
     833      $uses_bytecodes    = 1 if (($_ >= 128) && ($_ < 256));
     834    } unpack("U*", $str_in); # Unpack Unicode characters
     835
     836    return ($uses_bytecodes,$exceeds_bytecodes);
     837}
     838
     839
    7898401;
Note: See TracChangeset for help on using the changeset viewer.