Changeset 23362

Show
Ignore:
Timestamp:
01.12.2010 11:40:36 (8 years ago)
Author:
davidb
Message:

Additional routines (and few upgraded) to help support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

Location:
main/trunk/greenstone2/perllib
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/doc.pm

    r23278 r23362  
    6464    # the docsave processor now calls set_lastmodified 
    6565 
    66     $self->{'source_path'} = $source_filename; 
     66    $self->set_source_path($source_filename); 
    6767     
    6868    if (defined $source_filename) { 
     
    7676    return $self; 
    7777} 
     78 
     79 
     80sub set_source_path 
     81{ 
     82    my $self = shift @_; 
     83    my ($source_filename) = @_; 
     84 
     85    if (defined $source_filename) { 
     86    # On Windows the source_filename can be in terse DOS format 
     87    # e.g. test~1.txt 
     88 
     89    $self->{'terse_source_path'} = $source_filename; 
     90 
     91    if (-e $source_filename) { 
     92        # See if we can do better for Windows with a filename 
     93        if ($ENV{'GSDLOS'} =~ /^windows$/i) { 
     94        require Win32; 
     95        $self->{'source_path'} = Win32::GetLongPathName($source_filename); 
     96        } 
     97        else { 
     98        # For Unix-based systems, there is no difference between the two 
     99        $self->{'source_path'} = $source_filename; 
     100        } 
     101    } 
     102    else { 
     103        print STDERR "Warning: In doc::set_source_path(), file\n"; 
     104        print STDERR "           $source_filename\n"; 
     105        print STDERR "         does not exist\n"; 
     106         
     107        # (default) Set it to whatever we were given 
     108        $self->{'source_path'} = $source_filename; 
     109    }    
     110    } 
     111    else { 
     112    # Previous code for setting source_path allowed for 
     113    # it to be undefined, so continue this practice 
     114    $self->{'terse_source_path'} = undef; 
     115    $self->{'source_path'} = undef; 
     116    } 
     117} 
     118 
     119 
     120sub get_source_path 
     121{ 
     122    my $self = shift @_; 
     123 
     124    return $self->{'terse_source_path'}; 
     125} 
     126 
    78127# set lastmodified for OAI purposes, added by GRB, moved by kjdon 
    79128sub set_lastmodified { 
    80129    my $self = shift (@_); 
    81130 
    82     my $source_path = $self->{'source_path'}; 
     131    my $source_path = $self->{'terse_source_path'}; 
    83132    
    84133    if (defined $source_path && (-e $source_path)) { 
     
    207256    my $self = shift (@_); 
    208257 
    209     return $self->{'source_path'}; 
     258    return $self->{'terse_source_path'}; 
    210259} 
    211260 
  • main/trunk/greenstone2/perllib/ghtml.pm

    r22952 r23362  
    207207 
    208208    # a numeric entity 
    209     if ($entity =~ /^\#0*(\d+)/) { 
    210     my $code=$1; 
     209    my $code = undef; 
     210    if ($entity =~ m/^\#0*(\d+)$/) { 
     211    $code=$1; 
     212    } 
     213    elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) { 
     214    $code=hex($1); 
     215    } 
     216 
     217    if (defined $code) { 
    211218    # non-standard Microsoft breakage, as usual 
    212219    if ($code < 0x9f) { # code page 1252 uses reserved bytes 
  • main/trunk/greenstone2/perllib/unicode.pm

    r23304 r23362  
    619619     
    620620    if (!&is_url_encoded($text)) { 
    621     $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg; 
     621    $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg; 
    622622    # return the url-encoded character entity for underscore back to the entity 
    623623    $text =~ s/%26%23095%3B/&\#095;/g; 
     
    629629    my ($text) = @_; 
    630630 
    631     $text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige; 
     631    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige; 
     632    $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige; 
     633    $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige; 
     634 
    632635    return $text; 
    633636} 
     
    635638sub is_url_encoded { 
    636639    my ($text) = @_; 
    637     return ($text =~ m/\%([A-F0-9]{2})/); 
     640    return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i); 
    638641} 
    639642 
     
    756759 
    757760    my @url_encoded_chars 
    758     = map { $_ > 128 ?                      # if wide character... 
    759             sprintf("%%%2X", $_) :  # \x{...} 
    760             chr($_)           
    761         } unpack("U*", $str_in);        # unpack Unicode characters 
     761    = map { $_ > 255 ?                  # Needs to be represent in entity form 
     762            sprintf("&#x%X;",$_) :   
     763            $_ > 128 ?              # Representable in %XX form 
     764            sprintf("%%%2X", $_) :   
     765            chr($_)                 # otherwise, Ascii char 
     766        } unpack("U*", $str_in); # Unpack Unicode characters 
    762767 
    763768     
     
    775780    my $str_out = $str_in; 
    776781 
    777     $str_out =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg; 
     782    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig; 
     783    $str_out =~ s/&#([0-9]+);/chr($1)/eig; 
     784    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig; 
    778785 
    779786    return $str_out; 
  • main/trunk/greenstone2/perllib/util.pm

    r23314 r23362  
    2626package util; 
    2727 
     28use strict; 
     29 
     30use Encode; 
    2831use File::Copy; 
    2932use File::Basename; 
    30  
    31 use strict; 
    32  
    3333 
    3434# removes files (but not directories) 
     
    513513} 
    514514 
     515# Primarily for filenames generated by processing 
     516# content of HTML files (which are mapped to UTF-8 internally) 
     517# 
     518# To turn this into an octet string that really exists on the file 
     519# system: 
     520# 1. don't need to do anything special for Unix-based systems 
     521#   (as underlying file system is byte-code) 
     522# 2. need to map to short DOS filenames for Windows 
     523 
     524sub utf8_to_real_filename 
     525{ 
     526    my ($utf8_filename) = @_; 
     527 
     528    my $real_filename; 
     529 
     530    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     531    require Win32; 
     532    my $unicode_filename = decode("utf8",$utf8_filename); 
     533    $real_filename = Win32::GetShortPathName($unicode_filename); 
     534    } 
     535    else { 
     536    $real_filename = $utf8_filename; 
     537    } 
     538 
     539    return $real_filename; 
     540} 
     541 
     542 
     543sub fd_exists 
     544{ 
     545    my $filename_full_path = shift @_; 
     546    my $test_op = shift @_ || "-e"; 
     547 
     548    # By default tests for existance of file or directory (-e) 
     549    # Can be made more specific by providing second parameter (e.g. -f or -d) 
     550 
     551    my $exists = 0; 
     552 
     553    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     554    require Win32; 
     555    my $filename_short_path = Win32::GetShortPathName($filename_full_path);  
     556    if (!defined $filename_short_path) { 
     557        # Was probably still in UTF8 form (not what is needed on Windows) 
     558        my $unicode_filename_full_path = eval "decode(\"utf8\",\$filename_full_path)"; 
     559        if (defined $unicode_filename_full_path) { 
     560        $filename_short_path = Win32::GetShortPathName($unicode_filename_full_path); 
     561        } 
     562    } 
     563    $filename_full_path = $filename_short_path; 
     564    } 
     565 
     566    if (defined $filename_full_path) { 
     567    $exists = eval "($test_op \$filename_full_path)"; 
     568    } 
     569 
     570    return $exists; 
     571} 
     572 
     573sub file_exists 
     574{ 
     575    my ($filename_full_path) = @_; 
     576 
     577    return fd_exists($filename_full_path,"-f"); 
     578} 
     579 
     580sub dir_exists 
     581{ 
     582    my ($filename_full_path) = @_; 
     583 
     584    return fd_exists($filename_full_path,"-d"); 
     585} 
    515586 
    516587 
     
    9341005 
    9351006 
     1007 
    9361008# returns 1 if filename1 and filename2 point to the same 
    9371009# file or directory 
     
    9531025} 
    9541026 
     1027 
     1028sub filename_within_directory 
     1029{ 
     1030    my ($filename,$within_dir) = @_; 
     1031     
     1032    my $dirsep = &util::get_dirsep(); 
     1033    if ($within_dir !~ m/$dirsep$/) { 
     1034    $within_dir .= $dirsep; 
     1035    } 
     1036     
     1037    $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator 
     1038     
     1039    if ($filename =~ m/^$within_dir(.*)$/) { 
     1040    $filename = $1; 
     1041    } 
     1042     
     1043    return $filename; 
     1044} 
     1045 
    9551046sub filename_within_collection 
    9561047{ 
     
    9601051     
    9611052    if (defined $collect_dir) { 
    962     my $dirsep = &util::get_dirsep(); 
    963     if ($collect_dir !~ m/$dirsep$/) { 
    964         $collect_dir .= $dirsep; 
    965     } 
    966      
    967     $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator 
    968      
     1053 
    9691054    # if from within GSDLCOLLECTDIR, then remove directory prefix 
    9701055    # so source_filename is realative to it.  This is done to aid 
     
    9761061    # GSDLCOLLECTDIR subsequently needs to be put back on to turn 
    9771062    # it back into a full pathname. 
    978      
    979     if ($filename =~ /^$collect_dir(.*)$/) { 
    980         $filename = $1; 
    981     } 
     1063 
     1064    $filename = filename_within_directory($filename,$collect_dir); 
    9821065    } 
    9831066     
    9841067    return $filename; 
    9851068} 
     1069 
     1070sub prettyprint_file 
     1071{ 
     1072    my ($base_dir,$file) = @_; 
     1073 
     1074    my $filename_full_path = &util::filename_cat($base_dir,$file); 
     1075 
     1076    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     1077    require Win32; 
     1078 
     1079    # For some reason base_dir in the form c:/a/b/c 
     1080    # This leads to confusion later on, so turn it back into 
     1081    # the more usual Windows form 
     1082    $base_dir =~ s/\//\\/g;  
     1083    my $long_base_dir = Win32::GetLongPathName($base_dir); 
     1084    my $long_full_path = Win32::GetLongPathName($filename_full_path); 
     1085 
     1086    $file = filename_within_directory($long_full_path,$long_base_dir); 
     1087    } 
     1088 
     1089    return $file; 
     1090} 
     1091 
     1092 
     1093sub upgrade_if_dos_filename 
     1094{ 
     1095    my ($filename_full_path) = @_; 
     1096 
     1097    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     1098    require Win32; 
     1099    # Ensure any DOS-like filename, such as test~1.txt, has been upgraded 
     1100    # to its long (Windows) version 
     1101    $filename_full_path = Win32::GetLongPathName($filename_full_path); 
     1102    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone) 
     1103    $filename_full_path =~ s/^(.)/\l$1/; 
     1104    } 
     1105 
     1106    return $filename_full_path; 
     1107} 
     1108 
    9861109 
    9871110sub filename_is_absolute