Context Navigation

← Previous Change
Next Change →

HTMLPlugin.pm

Timestamp:

2010-12-02T00:36:37+13:00 (13 years ago)

Author:

davidb

Message:

Further refinement of code to support HTML linking between documents when using non-ASCII names on Windows

File:

: 1 edited

main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm (modified) (14 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

-              r23363
+              r23371
+    }
+    $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html
+    # remove any anchor names, e.g. foo.html#name becomes foo.html
+    # but watch out for any #'s that are part of entities, such as &#x3B1;
+    $link =~ s/([^&])\#.*$/$1/s;
     # some links may just be anchor names
     next unless ($link =~ /\S+/);
 …
     if ($ENV{'GSDLOS'} =~ m/^windows/i) {
     # this makes life so much easier... perl can cope with unix-style '/'s.
+    $base_dir =~ s@(\\)+@/@g;
+    $file =~ s@(\\)+@/@g;
+    }
+    $base_dir =~ s@(\\)+@/@g;
+    $file =~ s@(\\)+@/@g;
+    }
+    my $filename = &util::filename_cat($base_dir,$file);
+    my $upgraded_base_dir = &util::upgrade_if_dos_filename($base_dir);
+    my $upgraded_filename = &util::upgrade_if_dos_filename($filename);
+    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
+    # And again
+    $upgraded_base_dir =~ s@(\\)+@/@g;
+    $upgraded_filename =~ s@(\\)+@/@g;
+    # Need to make sure there is a '/' on the end of upgraded_base_dir
+    if ($upgraded_base_dir !~ m/\/$/) {
+        $upgraded_base_dir .= "/";
+    }
+    }
+    my $upgraded_file = &util::filename_within_directory($upgraded_filename,$upgraded_base_dir);
     # reset per-doc stuff...
 …
     my $arrSections = [];
     $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $file)/isge;
+    $$textref =~ s/<h([0-9]+)[^>]*>(.*?)<\/h[0-9]+>/$self->process_heading($1, $2, $arrSections, $upgraded_file)/isge;
     if (scalar(@$arrSections)) {
 …
     # links, so even if 'file_is_url' is off, still need to store info
     my ($tailname,$dirname) = &File::Basename::fileparse($file);
+    my ($tailname,$dirname) = &File::Basename::fileparse($upgraded_file);
 #    my $utf8_file = $self->filename_to_utf8_metadata($file);
 #    $utf8_file =~ s/&\#095;/_/g;
     my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname);
-#   if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
-#       print STDERR "***!! file = $file\n";
-#       print STDERR "***!! utf8_file = $utf8_file\n";
-#   }
 …
+    }
     $web_url =~ s/\\/\//g;
+    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
+    print STDERR "******* upgraded_file:       $upgraded_file\n";
+    print STDERR "******* adding URL metadata: $utf8_file\n";
+    }
     $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
 …
         # doesn't necessarily mean there are Section tags in
         # the document
         $self->process_section(\$text, $base_dir, $file, $doc_obj, $cursection);
+        $self->process_section(\$text, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
+        }
         while ($comment =~ s/$lt(.*?)$gt//s) {
 …
+    }
     if ($cursection ne "") {
         print $outhandle "HTMLPlugin: WARNING: $file contains unmatched <Section></Section> tags\n";
+        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains unmatched <Section></Section> tags\n";
+    }
 …
         if (!$found_something) {
         if ($self->{'verbosity'} > 2) {
             print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags so\n";
+            print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags so\n";
             print $outhandle "          will be processed as a single section document\n";
+        }
         # go ahead and process single-section document
         $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+        $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
         # if document contains no Section tags we'll go ahead
 …
         } else {
         print $outhandle "HTMLPlugin: WARNING: $file contains the following text outside\n";
+        print $outhandle "HTMLPlugin: WARNING: $upgraded_file contains the following text outside\n";
         print $outhandle "          of the final closing </Section> tag. This text will\n";
         print $outhandle "          be ignored.";
 …
         # been processed already but we should print the warning
         # as above and extract metadata
         print $outhandle "HTMLPlugin: WARNING: $file appears to contain no Section tags and\n";
+        print $outhandle "HTMLPlugin: WARNING: $upgraded_file appears to contain no Section tags and\n";
         print $outhandle "          is blank or empty.  Metadata will be assigned if present.\n";
+        }
 …
     # single section document
     $self->process_section($textref, $base_dir, $file, $doc_obj, $cursection);
+    $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection);
+    }
 …
     ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
+    # If web page didn't give encoding, then default to utf8
+    my $content_encoding= $self->{'content_encoding'} || "utf8";
+    if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
+        print STDERR "*** Encoding with $content_encoding href: $href\n";
+    }
+    $href = encode($content_encoding,$href);
+    if ($ENV{'GSDLOS'} =~ m/^windows$/) {
+        # Don't do any encoding for now, as not clear what
+        # the right thing to do is to support filename
+        # encoding on Windows when they are not UTF16
+        #
+    }
+    else {
+        # => Unix-based system
+        # If web page didn't give encoding, then default to utf8
+        my $content_encoding= $self->{'content_encoding'} || "utf8";
+        if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
+        print STDERR "**** Encoding with '$content_encoding', href: $href\n";
+        }
+        $href = encode($content_encoding,$href);
+    }
     $href = &unicode::raw_filename_to_url_encoded($href);
 …
     &ghtml::urlsafe ($href);
     if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {
         print STDERR "***!!! href=$href\n";
+        print STDERR "****** href=$href\n";
+    }
 …
     my ($link, $base_dir, $file) = @_;
+    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
+    # strip off hash part, e.g. #foo, but watch out for any entities, e.g. &#x3B1;
+    my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/;
     $hash_part = "" if !defined $hash_part;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 23371 for main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

Legend:

main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm

Download in other formats: