Changeset 23362


Ignore:
Timestamp:
2010-12-01T11:40:36+13:00 (13 years ago)
Author:
davidb
Message:

Additional routines (and few upgraded) to help support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

Location:
main/trunk/greenstone2/perllib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/doc.pm

    r23278 r23362  
    6464    # the docsave processor now calls set_lastmodified
    6565
    66     $self->{'source_path'} = $source_filename;
     66    $self->set_source_path($source_filename);
    6767   
    6868    if (defined $source_filename) {
     
    7676    return $self;
    7777}
     78
     79
     80sub set_source_path
     81{
     82    my $self = shift @_;
     83    my ($source_filename) = @_;
     84
     85    if (defined $source_filename) {
     86    # On Windows the source_filename can be in terse DOS format
     87    # e.g. test~1.txt
     88
     89    $self->{'terse_source_path'} = $source_filename;
     90
     91    if (-e $source_filename) {
     92        # See if we can do better for Windows with a filename
     93        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     94        require Win32;
     95        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
     96        }
     97        else {
     98        # For Unix-based systems, there is no difference between the two
     99        $self->{'source_path'} = $source_filename;
     100        }
     101    }
     102    else {
     103        print STDERR "Warning: In doc::set_source_path(), file\n";
     104        print STDERR "           $source_filename\n";
     105        print STDERR "         does not exist\n";
     106       
     107        # (default) Set it to whatever we were given
     108        $self->{'source_path'} = $source_filename;
     109    }   
     110    }
     111    else {
     112    # Previous code for setting source_path allowed for
     113    # it to be undefined, so continue this practice
     114    $self->{'terse_source_path'} = undef;
     115    $self->{'source_path'} = undef;
     116    }
     117}
     118
     119
     120sub get_source_path
     121{
     122    my $self = shift @_;
     123
     124    return $self->{'terse_source_path'};
     125}
     126
    78127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
    79128sub set_lastmodified {
    80129    my $self = shift (@_);
    81130
    82     my $source_path = $self->{'source_path'};
     131    my $source_path = $self->{'terse_source_path'};
    83132   
    84133    if (defined $source_path && (-e $source_path)) {
     
    207256    my $self = shift (@_);
    208257
    209     return $self->{'source_path'};
     258    return $self->{'terse_source_path'};
    210259}
    211260
  • main/trunk/greenstone2/perllib/ghtml.pm

    r22952 r23362  
    207207
    208208    # a numeric entity
    209     if ($entity =~ /^\#0*(\d+)/) {
    210     my $code=$1;
     209    my $code = undef;
     210    if ($entity =~ m/^\#0*(\d+)$/) {
     211    $code=$1;
     212    }
     213    elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) {
     214    $code=hex($1);
     215    }
     216
     217    if (defined $code) {
    211218    # non-standard Microsoft breakage, as usual
    212219    if ($code < 0x9f) { # code page 1252 uses reserved bytes
  • main/trunk/greenstone2/perllib/unicode.pm

    r23304 r23362  
    619619   
    620620    if (!&is_url_encoded($text)) {
    621     $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
     621    $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
    622622    # return the url-encoded character entity for underscore back to the entity
    623623    $text =~ s/%26%23095%3B/&\#095;/g;
     
    629629    my ($text) = @_;
    630630
    631     $text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
     631    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
     632    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
     633    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
     634
    632635    return $text;
    633636}
     
    635638sub is_url_encoded {
    636639    my ($text) = @_;
    637     return ($text =~ m/\%([A-F0-9]{2})/);
     640    return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
    638641}
    639642
     
    756759
    757760    my @url_encoded_chars
    758     = map { $_ > 128 ?                      # if wide character...
    759             sprintf("%%%2X", $_) :  # \x{...}
    760             chr($_)         
    761         } unpack("U*", $str_in);        # unpack Unicode characters
     761    = map { $_ > 255 ?                  # Needs to be represent in entity form
     762            sprintf("&#x%X;",$_) : 
     763            $_ > 128 ?              # Representable in %XX form
     764            sprintf("%%%2X", $_) : 
     765            chr($_)                 # otherwise, Ascii char
     766        } unpack("U*", $str_in); # Unpack Unicode characters
    762767
    763768   
     
    775780    my $str_out = $str_in;
    776781
    777     $str_out =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
     782    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
     783    $str_out =~ s/&#([0-9]+);/chr($1)/eig;
     784    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
    778785
    779786    return $str_out;
  • main/trunk/greenstone2/perllib/util.pm

    r23314 r23362  
    2626package util;
    2727
     28use strict;
     29
     30use Encode;
    2831use File::Copy;
    2932use File::Basename;
    30 
    31 use strict;
    32 
    3333
    3434# removes files (but not directories)
     
    513513}
    514514
     515# Primarily for filenames generated by processing
     516# content of HTML files (which are mapped to UTF-8 internally)
     517#
     518# To turn this into an octet string that really exists on the file
     519# system:
     520# 1. don't need to do anything special for Unix-based systems
     521#   (as underlying file system is byte-code)
     522# 2. need to map to short DOS filenames for Windows
     523
     524sub utf8_to_real_filename
     525{
     526    my ($utf8_filename) = @_;
     527
     528    my $real_filename;
     529
     530    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     531    require Win32;
     532    my $unicode_filename = decode("utf8",$utf8_filename);
     533    $real_filename = Win32::GetShortPathName($unicode_filename);
     534    }
     535    else {
     536    $real_filename = $utf8_filename;
     537    }
     538
     539    return $real_filename;
     540}
     541
     542
     543sub fd_exists
     544{
     545    my $filename_full_path = shift @_;
     546    my $test_op = shift @_ || "-e";
     547
     548    # By default tests for existance of file or directory (-e)
     549    # Can be made more specific by providing second parameter (e.g. -f or -d)
     550
     551    my $exists = 0;
     552
     553    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     554    require Win32;
     555    my $filename_short_path = Win32::GetShortPathName($filename_full_path);
     556    if (!defined $filename_short_path) {
     557        # Was probably still in UTF8 form (not what is needed on Windows)
     558        my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)";
     559        if (defined $unicode_filename_full_path) {
     560        $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path);
     561        }
     562    }
     563    $filename_full_path = $filename_short_path;
     564    }
     565
     566    if (defined $filename_full_path) {
     567    $exists = eval "($test_op \$filename_full_path)";
     568    }
     569
     570    return $exists;
     571}
     572
     573sub file_exists
     574{
     575    my ($filename_full_path) = @_;
     576
     577    return fd_exists($filename_full_path,"-f");
     578}
     579
     580sub dir_exists
     581{
     582    my ($filename_full_path) = @_;
     583
     584    return fd_exists($filename_full_path,"-d");
     585}
    515586
    516587
     
    9341005
    9351006
     1007
    9361008# returns 1 if filename1 and filename2 point to the same
    9371009# file or directory
     
    9531025}
    9541026
     1027
     1028sub filename_within_directory
     1029{
     1030    my ($filename,$within_dir) = @_;
     1031   
     1032    my $dirsep = &util::get_dirsep();
     1033    if ($within_dir !~ m/$dirsep$/) {
     1034    $within_dir .= $dirsep;
     1035    }
     1036   
     1037    $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator
     1038   
     1039    if ($filename =~ m/^$within_dir(.*)$/) {
     1040    $filename = $1;
     1041    }
     1042   
     1043    return $filename;
     1044}
     1045
    9551046sub filename_within_collection
    9561047{
     
    9601051   
    9611052    if (defined $collect_dir) {
    962     my $dirsep = &util::get_dirsep();
    963     if ($collect_dir !~ m/$dirsep$/) {
    964         $collect_dir .= $dirsep;
    965     }
    966    
    967     $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
    968    
     1053
    9691054    # if from within GSDLCOLLECTDIR, then remove directory prefix
    9701055    # so source_filename is realative to it.  This is done to aid
     
    9761061    # GSDLCOLLECTDIR subsequently needs to be put back on to turn
    9771062    # it back into a full pathname.
    978    
    979     if ($filename =~ /^$collect_dir(.*)$/) {
    980         $filename = $1;
    981     }
     1063
     1064    $filename = filename_within_directory($filename,$collect_dir);
    9821065    }
    9831066   
    9841067    return $filename;
    9851068}
     1069
     1070sub prettyprint_file
     1071{
     1072    my ($base_dir,$file) = @_;
     1073
     1074    my $filename_full_path = &util::filename_cat($base_dir,$file);
     1075
     1076    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     1077    require Win32;
     1078
     1079    # For some reason base_dir in the form c:/a/b/c
     1080    # This leads to confusion later on, so turn it back into
     1081    # the more usual Windows form
     1082    $base_dir =~ s/\//\\/g;
     1083    my $long_base_dir = Win32::GetLongPathName($base_dir);
     1084    my $long_full_path = Win32::GetLongPathName($filename_full_path);
     1085
     1086    $file = filename_within_directory($long_full_path,$long_base_dir);
     1087    }
     1088
     1089    return $file;
     1090}
     1091
     1092
     1093sub upgrade_if_dos_filename
     1094{
     1095    my ($filename_full_path) = @_;
     1096
     1097    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     1098    require Win32;
     1099    # Ensure any DOS-like filename, such as test~1.txt, has been upgraded
     1100    # to its long (Windows) version
     1101    $filename_full_path = Win32::GetLongPathName($filename_full_path);
     1102    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
     1103    $filename_full_path =~ s/^(.)/\l$1/;
     1104    }
     1105
     1106    return $filename_full_path;
     1107}
     1108
    9861109
    9871110sub filename_is_absolute
Note: See TracChangeset for help on using the changeset viewer.