Ignore:
Timestamp:
2010-12-02T00:36:37+13:00 (13 years ago)
Author:
davidb
Message:

Further refinement of code to support HTML linking between documents when using non-ascii names on Windows

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

    r23363 r23371  
    219219    }
    220220
    221     $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
     221    # remove any anchor names, e.g. foo.html#name becomes foo.html
     222    # but watch out for any #'s that are part of entities, such as α
     223    $link =~ s/([^&])\#.*$/$1/s;
     224
    222225    # some links may just be anchor names
    223226    next unless ($link =~ /\S+/);
     
    350353    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
    351354    # this makes life so much easier... perl can cope with unix-style '/'s.
    352     $base_dir =~ s@(\\)+@/@g;
    353     $file =~ s@(\\)+@/@g;
    354     }
     355    $base_dir =~ s@(\\)+@/@g;
     356    $file =~ s@(\\)+@/@g;     
     357    }
     358
     359    my $filename = &util::filename_cat($base_dir,$file);
     360    my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
     361    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
     362
     363    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
     364    # And again
     365    $upgraded_base_dir =~ s@(\\)+@/@g; 
     366    $upgraded_filename =~ s@(\\)+@/@g;
     367   
     368    # Need to make sure there is a '/' on the end of upgraded_base_dir
     369    if ($upgraded_base_dir !~ m/\/$/) {
     370        $upgraded_base_dir .= "/";
     371    }
     372    }
     373    my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
    355374   
    356375    # reset per-doc stuff...
     
    371390
    372391    my $arrSections = [];
    373     $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
     392    $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
    374393
    375394    if (scalar(@$arrSections)) {
     
    395414    # links, so even if 'file_is_url' is off, still need to store info
    396415
    397     my ($tailname,$dirname) = &File::Basename::fileparse($file);
     416    my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
    398417
    399418#    my $utf8_file = $self->filename_to_utf8_metadata($file);
    400419#    $utf8_file =~ s/&\#095;/_/g;
    401420    my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
    402 
    403 #   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    404 #       print STDERR "***!! file = $file\n";
    405 #       print STDERR "***!! utf8_file = $utf8_file\n";
    406 #   }
    407421
    408422
     
    426440    }
    427441    $web_url =~ s/\\/\//g;
     442
     443    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     444    print STDERR "******* upgraded_file:       $upgraded_file\n";
     445    print STDERR "******* adding URL metadata: $utf8_file\n";
     446    }
     447
     448
    428449    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    429450
     
    463484        # doesn't necessarily mean there are Section tags in
    464485        # the document
    465         $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
     486        $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
    466487        }
    467488        while ($comment =~ s/$lt(.*?)$gt//s) {
     
    509530    }
    510531    if ($cursection ne "") {
    511         print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
     532        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
    512533    }
    513534
     
    517538        if (!$found_something) {
    518539        if ($self->{'verbosity'} > 2) {
    519             print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
     540            print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
    520541            print $outhandle "          will be processed as a single section document\n";
    521542        }
    522543
    523544        # go ahead and process single-section document
    524         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
     545        $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
    525546
    526547        # if document contains no Section tags we'll go ahead
     
    532553
    533554        } else {
    534         print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
     555        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
    535556        print $outhandle "          of the final closing </Section> tag. This text will\n";
    536557        print $outhandle "          be ignored.";
     
    552573        # been processed already but we should print the warning
    553574        # as above and extract metadata
    554         print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
     575        print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
    555576        print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
    556577        }
     
    570591
    571592    # single section document
    572     $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
     593    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
    573594    }
    574595
     
    778799    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
    779800
    780     # If web page didn't give encoding, then default to utf8
    781     my $content_encoding= $self->{'content_encoding'} || "utf8";
    782 
    783     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    784         print STDERR "*** Encoding with $content_encoding href: $href\n";
    785     }
    786 
    787     $href = encode($content_encoding,$href);
     801    if ($ENV{'GSDLOS'} =~ m/^windows$/) {
     802
     803        # Don't do any encoding for now, as not clear what
     804        # the right thing to do is to support filename
     805        # encoding on Windows when they are not UTF16
     806        #
     807    }
     808    else {
     809        # => Unix-based system
     810
     811        # If web page didn't give encoding, then default to utf8
     812        my $content_encoding= $self->{'content_encoding'} || "utf8";
     813   
     814        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
     815        print STDERR "**** Encoding with '$content_encoding', href: $href\n";
     816        }
     817
     818        $href = encode($content_encoding,$href);
     819    }
    788820
    789821    $href = &unicode::raw_filename_to_url_encoded($href);
     
    791823
    792824    &ghtml::urlsafe ($href);
     825
    793826    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
    794         print STDERR "***!!! href=$href\n";   
     827        print STDERR "****** href=$href\n";   
    795828    }
    796829
     
    922955    my ($link, $base_dir, $file) = @_;
    923956
    924     my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
     957    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
     958    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
    925959   
    926960    $hash_part = "" if !defined $hash_part;
Note: See TracChangeset for help on using the changeset viewer.