Changeset 23371

Show
Ignore:
Timestamp:
02.12.2010 00:36:37 (8 years ago)
Author:
davidb
Message:

Further refinement of code to support HTML linking between documents when using non-ascii names on Windows

Location:
main/trunk/greenstone2/perllib
Files:
5 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r23182 r23371  
    516516 
    517517        # special case for URL metadata 
    518         if ($field =~ /^URL$/i) { 
    519             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] }); 
     518        if ($field =~ m/^URL$/i) { 
     519            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,  
     520                        $value, { 'section' => [ $section_OID ] }); 
    520521        } 
    521522         
  • main/trunk/greenstone2/perllib/ghtml.pm

    r23362 r23371  
    6565sub urlsafe 
    6666{ 
     67    # protect any hash's that are part of an entity, e.g. a 
     68    $_[0] =~ s/&#(.*?);/&%23$1;/g; 
     69 
     70    # and the usual suspects 
    6771    $_[0] =~ s/[\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/]/sprintf("%%%2x", ord($&))/gse; 
    6872} 
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23363 r23371  
    219219    } 
    220220 
    221     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html  
     221    # remove any anchor names, e.g. foo.html#name becomes foo.html  
     222    # but watch out for any #'s that are part of entities, such as α 
     223    $link =~ s/([^&])\#.*$/$1/s;  
     224 
    222225    # some links may just be anchor names 
    223226    next unless ($link =~ /\S+/); 
     
    350353    if ($ENV{'GSDLOS'} =~ m/^windows/i) { 
    351354    # this makes life so much easier... perl can cope with unix-style '/'s. 
    352     $base_dir =~ s@(\\)+@/@g; 
    353     $file =~ s@(\\)+@/@g;  
    354     } 
     355    $base_dir =~ s@(\\)+@/@g;  
     356    $file =~ s@(\\)+@/@g;      
     357    } 
     358 
     359    my $filename = &util::filename_cat($base_dir,$file); 
     360    my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir); 
     361    my $upgraded_filename = &util::upgrade_if_dos_filename($filename); 
     362 
     363    if ($ENV{'GSDLOS'} =~ m/^windows/i) { 
     364    # And again 
     365    $upgraded_base_dir =~ s@(\\)+@/@g;   
     366    $upgraded_filename =~ s@(\\)+@/@g; 
     367     
     368    # Need to make sure there is a '/' on the end of upgraded_base_dir 
     369    if ($upgraded_base_dir !~ m/\/$/) { 
     370        $upgraded_base_dir .= "/"; 
     371    } 
     372    } 
     373    my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir); 
    355374     
    356375    # reset per-doc stuff... 
     
    371390 
    372391    my $arrSections = []; 
    373     $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge; 
     392    $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge; 
    374393 
    375394    if (scalar(@$arrSections)) { 
     
    395414    # links, so even if 'file_is_url' is off, still need to store info 
    396415 
    397     my ($tailname,$dirname) = &File::Basename::fileparse($file); 
     416    my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file); 
    398417 
    399418#    my $utf8_file = $self->filename_to_utf8_metadata($file); 
    400419#    $utf8_file =~ s/&\#095;/_/g; 
    401420    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 
    402  
    403 #   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    404 #       print STDERR "***!! file = $file\n"; 
    405 #       print STDERR "***!! utf8_file = $utf8_file\n"; 
    406 #   } 
    407421 
    408422 
     
    426440    } 
    427441    $web_url =~ s/\\/\//g; 
     442 
     443    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     444    print STDERR "******* upgraded_file:       $upgraded_file\n"; 
     445    print STDERR "******* adding URL metadata: $utf8_file\n"; 
     446    } 
     447 
     448 
    428449    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 
    429450 
     
    463484        # doesn't necessarily mean there are Section tags in 
    464485        # the document 
    465         $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection); 
     486        $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 
    466487        } 
    467488        while ($comment =~ s/$lt(.*?)$gt//s) { 
     
    509530    } 
    510531    if ($cursection ne "") { 
    511         print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n"; 
     532        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n"; 
    512533    } 
    513534 
     
    517538        if (!$found_something) { 
    518539        if ($self->{'verbosity'} > 2) { 
    519             print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n"; 
     540            print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n"; 
    520541            print $outhandle "          will be processed as a single section document\n"; 
    521542        } 
    522543 
    523544        # go ahead and process single-section document 
    524         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 
     545        $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 
    525546 
    526547        # if document contains no Section tags we'll go ahead 
     
    532553 
    533554        } else { 
    534         print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n"; 
     555        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n"; 
    535556        print $outhandle "          of the final closing </Section> tag. This text will\n"; 
    536557        print $outhandle "          be ignored."; 
     
    552573        # been processed already but we should print the warning 
    553574        # as above and extract metadata 
    554         print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n"; 
     575        print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n"; 
    555576        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n"; 
    556577        } 
     
    570591 
    571592    # single section document 
    572     $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection); 
     593    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); 
    573594    } 
    574595 
     
    778799    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 
    779800 
    780     # If web page didn't give encoding, then default to utf8 
    781     my $content_encoding= $self->{'content_encoding'} || "utf8"; 
    782  
    783     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    784         print STDERR "*** Encoding with $content_encoding href: $href\n"; 
    785     } 
    786  
    787     $href = encode($content_encoding,$href); 
     801    if ($ENV{'GSDLOS'} =~ m/^windows$/) { 
     802 
     803        # Don't do any encoding for now, as not clear what 
     804        # the right thing to do is to support filename 
     805        # encoding on Windows when they are not UTF16 
     806        #  
     807    } 
     808    else { 
     809        # => Unix-based system 
     810 
     811        # If web page didn't give encoding, then default to utf8 
     812        my $content_encoding= $self->{'content_encoding'} || "utf8"; 
     813     
     814        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
     815        print STDERR "**** Encoding with '$content_encoding', href: $href\n"; 
     816        } 
     817 
     818        $href = encode($content_encoding,$href); 
     819    } 
    788820 
    789821    $href = &unicode::raw_filename_to_url_encoded($href); 
     
    791823 
    792824    &ghtml::urlsafe ($href); 
     825 
    793826    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 
    794         print STDERR "***!!! href=$href\n";     
     827        print STDERR "****** href=$href\n";     
    795828    } 
    796829 
     
    922955    my ($link, $base_dir, $file) = @_; 
    923956 
    924     my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/; 
     957    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1; 
     958    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/; 
    925959     
    926960    $hash_part = "" if !defined $hash_part; 
  • main/trunk/greenstone2/perllib/unicode.pm

    r23362 r23371  
    780780    my $str_out = $str_in; 
    781781 
     782    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig; 
    782783    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig; 
    783784    $str_out =~ s/&#([0-9]+);/chr($1)/eig; 
    784     $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig; 
    785785 
    786786    return $str_out; 
  • main/trunk/greenstone2/perllib/util.pm

    r23362 r23371  
    10301030    my ($filename,$within_dir) = @_; 
    10311031     
    1032     my $dirsep = &util::get_dirsep(); 
    1033     if ($within_dir !~ m/$dirsep$/) { 
     1032    if ($within_dir !~ m/[\/\\]$/) { 
     1033    my $dirsep = &util::get_dirsep(); 
    10341034    $within_dir .= $dirsep; 
    10351035    } 
    10361036     
    10371037    $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator 
    1038      
     1038 
    10391039    if ($filename =~ m/^$within_dir(.*)$/) { 
    10401040    $filename = $1; 
     
    10931093sub upgrade_if_dos_filename 
    10941094{ 
    1095     my ($filename_full_path) = @_; 
     1095    my ($filename_full_path,$and_encode) = @_; 
    10961096 
    10971097    if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 
     
    11021102    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone) 
    11031103    $filename_full_path =~ s/^(.)/\l$1/; 
     1104    if ((defined $and_encode) && ($and_encode)) { 
     1105        $filename_full_path = encode("utf8",$filename_full_path); 
     1106    } 
    11041107    } 
    11051108