Context Navigation

← Previous Changeset
Next Changeset →

Changeset 16769

Timestamp:

2008-08-13T16:57:24+12:00 (16 years ago)

Author:

ak19

Message:

Intermediate version (with commented out debug statements). 1. Works with multilingual image filenames; 2. Improved pattern match for identifying href_links and similar (uses pattern match for img src links); 3. Explicit use of m for match statements

File:

: 1 edited

gsdl/trunk/perllib/plugins/HTMLPlugin.pm (modified) (36 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/plugins/HTMLPlugin.pm

-              r16735
+              r16769
     # check for symbol fonts
     if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
+    if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
         my $font = $1;
         print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
         if ($font !~ /^arial$/i);
+        if ($font !~ m/^arial$/i);
+    }
 …
     # remove tags without a starting tag from the section
     my ($tag, $tagstart);
     while ($section =~ /<\/([^>]{1,10})>/) {
+    while ($section =~ m/<\/([^>]{1,10})>/) {
     $tag = $1;
     $tagstart = index($section, "<$tag");
 …
     my $f_separator = &util::get_os_dirsep();
     if ($dirname =~ /import$f_separator/)
+    if ($dirname =~ m/import$f_separator/)
+    {
         $test_dirname = $'; #'
 …
     #print STDERR "init $'\n";
     while ($test_dirname =~ /[$f_separator]/)
+    while ($test_dirname =~ m/[$f_separator]/)
+    {
         my $folderdirname = $`;
 …
     # set the file to be tidied
     $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
+    $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
     # get the tidied file
 …
     # read in file ($text will be in utf8)
+    my $text = "";
+    $self->read_file ($filename_full_path, $encoding, $language, \$text);
+    my $textref = \$text;
+    my $raw_text = "";
+    $self->read_file_no_decoding ($filename_full_path, \$raw_text);
+    my $textref = \$raw_text;
     my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
     my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
 …
     my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
+    if(!defined $self->{'utf8_to_original_filename'}) {
+    # maps from utf8 converted link name -> original filename referred to by (possibly URL-encoded) src url
+    $self->{'utf8_to_original_filename'} = {};
+    }
     foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
 …
+    }
     $link = $self->eval_dir_dots($link);
-    # Replace %XX's in URL with decoded value if required.
-    # Note that the filename may include the %XX in some situations
-    if ($link =~ m/\%[A-F0-9]{2}/i) {
-        if (!-e $link) {
-        $link =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
+        }
+    }
+    $block_hash->{'file_blocks'}->{$link} = 1;
+    }
+    # this is the actual filename on the filesystem (that the link refers to)
+    my $url_original_filename = $self->opt_url_decode($link);
+    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
+    my $utf8_link = "";
+    $self->decode_text($link,$encoding,$language,\$utf8_link);
+### my $utf8_url_encoded_link = &unicode::url_encode($utf8_link);
+    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
+### $self->{'utf8_to_original_filename'}->{$utf8_url_encoded_link} = $url_original_filename;
+#   print STDERR "**** Storing block link: $link\n";
+#   print STDERR "**** URL original filename: $url_original_filename\n";
+    print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
+    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
+    }
+}
+# Given a link/filename that may be URL-encoded, return the name of the file it refers
+# to. Because this method is intended to work out the *original* filename, it will
+# not URL-decode the name if a file with the *URL-encoded* name already exists
+# (checked with -e on the string exactly as it was passed in).
+# Returns a single value: the parameter unchanged when it has no %XX escape or names an
+# existing file, otherwise the result of &unicode::url_decode. No decoded flag is returned.
+sub opt_url_decode {
+    my $self = shift (@_);
+    my ($link) = @_;
+    # Only consider decoding when the string contains at least one %XX hex escape.
+    # A file whose real name contains %XX is left alone (see the -e test below).
+    if ($link =~ m/\%[A-F0-9]{2}/i) {
+    if (!-e $link) {
+        $link = &unicode::url_decode($link);
+    }
+    }
+    return $link;
+}
 …
     my $outhandle = $self->{'outhandle'};
     if ($ENV{'GSDLOS'} =~ /^windows/i) {
+    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
     # this makes life so much easier... perl can cope with unix-style '/'s.
     $base_dir =~ s@(\\)+@/@g;
 …
     # links, so even if 'file_is_url' is off, still need to store info
+#    print STDERR "#### file: $file\n";
     my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
     my $utf8_file = $self->filename_to_utf8_metadata($file);
+#    $utf8_file = &unicode::url_encode($utf8_file);
     my $web_url = "http://";
     if(defined $dirname) { # local directory
 …
     $web_url = $web_url.$utf8_file;
+    }
+#    print STDERR "#### weburl: $web_url\n";
     $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
 …
             $found_something = 1;
             $cursection = $doc_obj->get_parent_section ($cursection);
         } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
+        } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
             my $metaname = $1;
             my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
+            my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
             $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
             my $metavalue = $1;
 …
             $metavalue =~ s/[\cJ\cM]/ /sg;
             $metavalue =~ s/<[^>]+>//sg
             unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
+            unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
             $metavalue =~ s/\s+/ /sg;
             if ($accumulate) {
 …
     $$textref =~ s/^.*?<body[^>]*>//is;
     $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
     if ($$textref =~ /\S/) {
+    if ($$textref =~ m/\S/) {
         if (!$found_something) {
         if ($self->{'verbosity'} > 2) {
 …
     if (!$self->{'nolinks'}) {
     # usemap="./#index" not handled correctly => change to "#index"
+    $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
+##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
+    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
         $self->replace_usemap_links($1, $2, $3)/isge;
+    $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
+##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
+    $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
         $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
+    }
 …
     #   absolute paths for the images, and without the "file://" prefix
     # So check for this special case and massage the data to be correct
     if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
+    if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
     $link =~ s/^.*\\([^\\]+)$/$1/;
+    }
 …
     my $img_file =  $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
     print STDERR "**** link = $link\n";
     print STDERR "**** href = $href\n";
     print STDERR "**** img_file = $img_file\n";
+#    print STDERR "**** link = $link\n";
+#    print STDERR "**** href = $href\n";
+#    print STDERR "**** img_file = $img_file\n";
     my $anchor_name = $img_file;
 …
     my $self = shift (@_);
     my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
+    # remove quotes from link at start and end if necessary
+    if ($link=~/^[\"\']/) {
+    $link=~s/^[\"\']//;
+    $link=~s/[\"\']$//;
+    $front.='"';
+    $back="\"$back";
+    }
     # attempt to sort out targets - frames are not handled
 …
     $back =~ s/target=\"?_parent\"?//is;
     return $front . $link . $back if $link =~ /^\#/s;
+    return $front . $link . $back if $link =~ m/^\#/s;
     $link =~ s/\n/ /g;
+    # Find the file referred to by $link on the file system.
+    # This is more complicated than it sounds when character encodings
+    # are taken into account.
+##    &unicode::ensure_utf8(\$link);
+##    $link = &unicode::url_encode($link);
+#    print STDERR "#### filepath: ".&util::filename_cat($base_dir,$file)."\n";
+#    print STDERR "#### link: $link\n";
     my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
+#    print STDERR "#### href: $href\n";
     # href may use '\'s where '/'s should be on Windows
     $href =~ s/\\/\//g;
+    my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
+##    $href = &unicode::url_decode($href);
+#    print STDERR "#### href again: $href\n";
+    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
 …
     ##### the intermediate page) in the top level window - I'm not sure if that's
     ##### possible - the following line should probably be deleted if that can be done
     return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
     if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
     ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
+    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
+    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
+    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
     &ghtml::urlsafe ($href);
     return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
 …
     $filename = &util::filename_cat($base_dir, $filename);
+    # Replace %XX's in URL with decoded value if required.
+    # Note that the filename may include the %XX in some situations
+    if ($filename =~ m/\%[A-F0-9]{2}/i) {
+    if (!-e $filename) {
+        $filename =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
+    }
+    }
+    my ($ext) = $filename =~ /(\.[^\.]*)$/;
+#    print STDERR "**** filename: $filename\n";
+    # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
+    # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
+    my $utf8_filename = $filename;
+#    print STDERR "*** filename before URL decoding: $filename\n";
+    $filename = $self->opt_url_decode($utf8_filename);
+#    print STDERR "*** filename after URL decoding:  $filename\n\n";
+    # some special processing if the intended filename was converted to utf8, but
+    # the actual file still needs to be renamed
+    if (!-e $filename) {
+    # try the original filename stored in map
+    my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
+    if (-e $original_filename) {
+        $filename = $original_filename;
+    }
+    }
+    my ($ext) = $filename =~ m/(\.[^\.]*)$/;
     if ($rl == 0) {
     if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
+    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
         return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
+    }
 …
+    }
     if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
+    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
     return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
+    }
 …
     return "_httpdocimg_/$newname";
     } else {
+    ($newname) = $filename =~ /([^\/\\]*)$/;
+    # Make sure this name is a valid utf8 filename
+    ## &unicode::ensure_utf8(\$newname);
+    $newname =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
+    ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
+#   print STDERR "Before url encoding newname: $newname\n";
+    # Make sure this name uses only ASCII characters
+    # => use URL encoding, as it preserves original encoding
+    $newname = &unicode::url_encode($newname);
+#   print STDERR "After url encoding newname: $newname\n";
+#   print STDERR "*** Real name and converted filename:\n\t$filename\n\t$newname\n";
     $doc_obj->associate_file($filename, $newname, undef, $section);
+    # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
+    # of filenames, URL-encode the additional percent signs of the URL-encoded filename
     my $newname_url = $newname;
     $newname_url =~ s/%/%25/g;
+    return "_httpdocimg_/$newname_url";
+    }
+}
+    return "_httpdocimg_/$newname_url";
+    }
+}
 …
     my ($link, $base_dir, $file) = @_;
     my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
+    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
     $hash_part = "" if !defined $hash_part;
     if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
+    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
     my $outhandle = $self->{'outhandle'};
     print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
 …
     my $type = $1;
     if ($link =~ /^(http|ftp):/i) {
+    if ($link =~ m/^(http|ftp):/i) {
         # Turn url (using /) into file name (possibly using \ on windows)
         my @http_dir_split = split('/', $before_hash);
 …
     # make sure there's a slash on the end if it's a directory
     if ($before_hash !~ /\/$/) {
+    if ($before_hash !~ m/\/$/) {
         $before_hash .= "/" if (-d $linkfilename);
+    }
     return ($type . $before_hash, $hash_part, $rl);
+    } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) {
+    if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
+    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
+    if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
         # the first directory will be the domain name if file_is_url
 …
         # => turn into relative link if this is so!
         if ($ENV{'GSDLOS'} =~ /^windows/i) {
+        if ($ENV{'GSDLOS'} =~ m/^windows/i) {
             # too difficult doing a pattern match with embedded '\'s...
             my $win_before_hash=$before_hash;
 …
         my $dirname = &File::Basename::dirname($file);
         $before_hash = &util::filename_cat($dirname, $before_hash);
+        $before_hash = $self->eval_dir_dots($before_hash);
+        $before_hash = $self->eval_dir_dots($before_hash);
+#       print STDERR "#### before_hash: $before_hash\n";
+    }
     my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
     # make sure there's a slash on the end if it's a directory
     if ($before_hash !~ /\/$/) {
+    if ($before_hash !~ m/\/$/) {
         $before_hash .= "/" if (-d $linkfilename);
+    }
 …
     # support tag<tagname>
     if ($field =~ /^(.*?)<(.*?)>$/) {
+    if ($field =~ m/^(.*?)<(.*?)>$/) {
         # "$2" is the user's preferred gs metadata name
         $find_fields{lc($1)}=$2; # lc = lowercase
 …
     my $found_title = 0;
     # this assumes that ">" won't appear. (I don't think it's allowed to...)
     $html_header =~ /^/; # match the start of the string, for \G assertion
+    $html_header =~ m/^/; # match the start of the string, for \G assertion
     while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
 …
     # find the tag name
     $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
+    $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
     $tag=$2;
     # in case they're not using " or ', but they should...
     if (! $tag) {
         $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
+        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
         $tag=$1;
+    }
 …
     # find the tag content
     $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
+    $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
     $value=$2;
     if (! $value) {
         $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
+        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
         $value=$1;
+    }
 …
     print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
         if ($self->{'verbosity'} > 2);
     if ($tag =~ /date.*/i){
+    if ($tag =~ m/date.*/i){
         $tag = lc($tag);
+    }
 …
     my $title;
     my $from = ""; # for debugging output only
     if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
+    if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
         $title = $1;
         $from = "<title> tags";
 …
     foreach my $field (keys %find_fields) {
     if ($field !~ /^tag([a-z0-9]+)$/i) {next}
+    if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
     my $tag = $1;
     if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
 …
     my ($front, $link, $back) = @_;
+    # remove quotes from link at start and end if necessary
+    if ($link=~/^[\"\']/) {
+    $link=~s/^[\"\']//;
+    $link=~s/[\"\']$//;
+    $front.='"';
+    $back="\"$back";
+    }
     $link =~ s/^\.\///;
     return $front . $link . $back;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 16769

Legend:

gsdl/trunk/perllib/plugins/HTMLPlugin.pm

Download in other formats: