Changeset 23371 for main/trunk


Ignore:
Timestamp:
2010-12-02T00:36:37+13:00 (13 years ago)
Author:
davidb
Message:

Further refinement of code to support HTML linking between documents when using non-ascii names on Windows

Location:
main/trunk/greenstone2/perllib
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/basebuildproc.pm

    r23182 r23371  
    516516
    517517        # special case for URL metadata
    518         if ($field =~ /^URL$/i) {
    519             &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
     518        if ($field =~ m/^URL$/i) {
     519            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
     520                        $value, { 'section' => [ $section_OID ] });
    520521        }
    521522       
  • main/trunk/greenstone2/perllib/ghtml.pm

    r23362 r23371  
    6565sub urlsafe
    6666{
     67    # protect any hash's that are part of an entity, e.g. a
     68    $_[0] =~ s/&#(.*?);/&%23$1;/g;
     69
     70    # and the usual suspects
    6771    $_[0] =~ s/[\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/]/sprintf("%%%2x", ord($&))/gse;
    6872}
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23363 r23371  
    219219    }
    220220
    221     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
     221    # remove any anchor names, e.g. foo.html#name becomes foo.html
     222    # but watch out for any #'s that are part of entities, such as α
     223    $link =~ s/([^&])\#.*$/$1/s;
     224
    222225    # some links may just be anchor names
    223226    next unless ($link =~ /\S+/);
     
    350353    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
    351354    # this makes life so much easier... perl can cope with unix-style '/'s.
    352     $base_dir =~ s@(\\)+@/@g;
    353     $file =~ s@(\\)+@/@g;
    354     }
     355    $base_dir =~ s@(\\)+@/@g;
     356    $file =~ s@(\\)+@/@g;     
     357    }
     358
     359    my $filename = &util::filename_cat($base_dir,$file);
     360    my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
     361    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
     362
     363    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
     364    # And again
     365    $upgraded_base_dir =~ s@(\\)+@/@g; 
     366    $upgraded_filename =~ s@(\\)+@/@g;
     367   
     368    # Need to make sure there is a '/' on the end of upgraded_base_dir
     369    if ($upgraded_base_dir !~ m/\/$/) {
     370        $upgraded_base_dir .= "/";
     371    }
     372    }
     373    my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
    355374   
    356375    # reset per-doc stuff...
     
    371390
    372391    my $arrSections = [];
    373     $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
     392    $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
    374393
    375394    if (scalar(@$arrSections)) {
     
    395414    # links, so even if 'file_is_url' is off, still need to store info
    396415
    397     my ($tailname,$dirname) = &File::Basename::fileparse($file);
     416    my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
    398417
    399418#    my $utf8_file = $self->filename_to_utf8_metadata($file);
    400419#    $utf8_file =~ s/&\#095;/_/g;
    401420    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
    402 
    403 #   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    404 #       print STDERR "***!! file = $file\n";
    405 #       print STDERR "***!! utf8_file = $utf8_file\n";
    406 #   }
    407421
    408422
     
    426440    }
    427441    $web_url =~ s/\\/\//g;
     442
     443    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     444    print STDERR "******* upgraded_file:       $upgraded_file\n";
     445    print STDERR "******* adding URL metadata: $utf8_file\n";
     446    }
     447
     448
    428449    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    429450
     
    463484        # doesn't necessarily mean there are Section tags in
    464485        # the document
    465         $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
     486        $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
    466487        }
    467488        while ($comment =~ s/$lt(.*?)$gt//s) {
     
    509530    }
    510531    if ($cursection ne "") {
    511         print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
     532        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
    512533    }
    513534
     
    517538        if (!$found_something) {
    518539        if ($self->{'verbosity'} > 2) {
    519             print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
     540            print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
    520541            print $outhandle "          will be processed as a single section document\n";
    521542        }
    522543
    523544        # go ahead and process single-section document
    524         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
     545        $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
    525546
    526547        # if document contains no Section tags we'll go ahead
     
    532553
    533554        } else {
    534         print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
     555        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
    535556        print $outhandle "          of the final closing </Section> tag. This text will\n";
    536557        print $outhandle "          be ignored.";
     
    552573        # been processed already but we should print the warning
    553574        # as above and extract metadata
    554         print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
     575        print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
    555576        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
    556577        }
     
    570591
    571592    # single section document
    572     $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
     593    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
    573594    }
    574595
     
    778799    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
    779800
    780     # If web page didn't give encoding, then default to utf8
    781     my $content_encoding= $self->{'content_encoding'} || "utf8";
    782 
    783     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    784         print STDERR "*** Encoding with $content_encoding href: $href\n";
    785     }
    786 
    787     $href = encode($content_encoding,$href);
     801    if ($ENV{'GSDLOS'} =~ m/^windows$/) {
     802
     803        # Don't do any encoding for now, as not clear what
     804        # the right thing to do is to support filename
     805        # encoding on Windows when they are not UTF16
     806        #
     807    }
     808    else {
     809        # => Unix-based system
     810
     811        # If web page didn't give encoding, then default to utf8
     812        my $content_encoding= $self->{'content_encoding'} || "utf8";
     813   
     814        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     815        print STDERR "**** Encoding with '$content_encoding', href: $href\n";
     816        }
     817
     818        $href = encode($content_encoding,$href);
     819    }
    788820
    789821    $href = &unicode::raw_filename_to_url_encoded($href);
     
    791823
    792824    &ghtml::urlsafe ($href);
     825
    793826    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    794         print STDERR "***!!! href=$href\n";   
     827        print STDERR "****** href=$href\n";   
    795828    }
    796829
     
    922955    my ($link, $base_dir, $file) = @_;
    923956
    924     my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
     957    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
     958    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
    925959   
    926960    $hash_part = "" if !defined $hash_part;
  • main/trunk/greenstone2/perllib/unicode.pm

    r23362 r23371  
    780780    my $str_out = $str_in;
    781781
     782    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
    782783    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
    783784    $str_out =~ s/&#([0-9]+);/chr($1)/eig;
    784     $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
    785785
    786786    return $str_out;
  • main/trunk/greenstone2/perllib/util.pm

    r23362 r23371  
    10301030    my ($filename,$within_dir) = @_;
    10311031   
    1032     my $dirsep = &util::get_dirsep();
    1033     if ($within_dir !~ m/$dirsep$/) {
     1032    if ($within_dir !~ m/[\/\\]$/) {
     1033    my $dirsep = &util::get_dirsep();
    10341034    $within_dir .= $dirsep;
    10351035    }
    10361036   
    10371037    $within_dir =~ s/\\/\\\\/g; # escape DOS style file separator
    1038    
     1038
    10391039    if ($filename =~ m/^$within_dir(.*)$/) {
    10401040    $filename = $1;
     
    10931093sub upgrade_if_dos_filename
    10941094{
    1095     my ($filename_full_path) = @_;
     1095    my ($filename_full_path,$and_encode) = @_;
    10961096
    10971097    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     
    11021102    # Make sure initial drive letter is lower-case (to fit in with rest of Greenstone)
    11031103    $filename_full_path =~ s/^(.)/\l$1/;
     1104    if ((defined $and_encode) && ($and_encode)) {
     1105        $filename_full_path = encode("utf8",$filename_full_path);
     1106    }
    11041107    }
    11051108
Note: See TracChangeset for help on using the changeset viewer.