Changeset 16769


Ignore:
Timestamp:
2008-08-13T16:57:24+12:00 (16 years ago)
Author:
ak19
Message:

Intermediate version (with commented out debug statements). 1. Works with multilingual image filenames; 2. Improved pattern match for identifying href_links and similar (uses pattern match for img src links); 3. Explicit use of m for match statements

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/HTMLPlugin.pm

    r16735 r16769  
    165165   
    166166    # check for symbol fonts
    167     if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
     167    if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {
    168168        my $font = $1;
    169169        print STDERR "HBPlug::HB_gettext - warning removed font $font\n"
    170         if ($font !~ /^arial$/i);
     170        if ($font !~ m/^arial$/i);
    171171    }
    172172
     
    191191    # remove tags without a starting tag from the section
    192192    my ($tag, $tagstart);
    193     while ($section =~ /<\/([^>]{1,10})>/) {
     193    while ($section =~ m/<\/([^>]{1,10})>/) {
    194194    $tag = $1;
    195195    $tagstart = index($section, "<$tag");
     
    353353    my $f_separator = &util::get_os_dirsep();
    354354   
    355     if ($dirname =~ /import$f_separator/)
     355    if ($dirname =~ m/import$f_separator/)
    356356    {
    357357        $test_dirname = $'; #'
     
    359359    #print STDERR "init $'\n";
    360360   
    361     while ($test_dirname =~ /[$f_separator]/)
     361    while ($test_dirname =~ m/[$f_separator]/)
    362362    {
    363363        my $folderdirname = $`;
     
    504504   
    505505    # set the file to be tidied
    506     $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;
     506    $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/;
    507507   
    508508    # get the tidied file
     
    572572
    573573    # read in file ($text will be in utf8)
    574     my $text = "";
    575     $self->read_file ($filename_full_path, $encoding, $language, \$text);
    576     my $textref = \$text;
     574    my $raw_text = "";
     575    $self->read_file_no_decoding ($filename_full_path, \$raw_text);
     576
     577    my $textref = \$raw_text;
    577578    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))';
    578579    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)';
     
    587588    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs);
    588589
     590    if(!defined $self->{'utf8_to_original_filename'}) {
     591    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url
     592    $self->{'utf8_to_original_filename'} = {};
     593    }
     594
    589595    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {
    590596
     
    605611    }
    606612    $link = $self->eval_dir_dots($link);
    607 
    608     # Replace %XX's in URL with decoded value if required.
    609     # Note that the filename may include the %XX in some situations
    610     if ($link =~ m/\%[A-F0-9]{2}/i) {
    611         if (!-e $link) {
    612         $link =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
    613         }
    614     }
    615613   
    616     $block_hash->{'file_blocks'}->{$link} = 1;
    617     }
     614
     615    # this is the actual filename on the filesystem (that the link refers to)
     616    my $url_original_filename = $self->opt_url_decode($link);
     617
     618    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename
     619    my $utf8_link = "";
     620    $self->decode_text($link,$encoding,$language,\$utf8_link);
     621### my $utf8_url_encoded_link = &unicode::url_encode($utf8_link);
     622
     623    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename;
     624### $self->{'utf8_to_original_filename'}->{$utf8_url_encoded_link} = $url_original_filename;
     625
     626#   print STDERR "**** Storing block link: $link\n";
     627#   print STDERR "**** URL original filename: $url_original_filename\n";
     628    print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n";
     629
     630    $block_hash->{'file_blocks'}->{$url_original_filename} = 1;
     631    }
     632}
     633
     634# Given a filename in any encoding, will URL decode it to get back the original filename
     635# in the original encoding. Because this method is intended to work out the *original*
     636# filename*, it not URL decode any filename if a file by the name of the *URL-encoded*
     637# string already exists in the local folder.
     638# Return the original filename corresponding to the parameter URL-encoded filename, and
     639# a decoded flag that is set to true iff URL-decoding had to be applied.
     640sub opt_url_decode {
     641    my $self = shift (@_);
     642    my ($link) = @_;
     643
     644    # Replace %XX's in URL with decoded value if required.
     645    # Note that the filename may include the %XX in some situations
     646    if ($link =~ m/\%[A-F0-9]{2}/i) {
     647    if (!-e $link) {
     648        $link = &unicode::url_decode($link);
     649    }
     650    }
     651
     652    return $link;
    618653}
    619654
     
    625660    my $outhandle = $self->{'outhandle'};
    626661
    627     if ($ENV{'GSDLOS'} =~ /^windows/i) {
     662    if ($ENV{'GSDLOS'} =~ m/^windows/i) {
    628663    # this makes life so much easier... perl can cope with unix-style '/'s.
    629664    $base_dir =~ s@(\\)+@/@g;
     
    672707    # links, so even if 'file_is_url' is off, still need to store info
    673708
     709#    print STDERR "#### file: $file\n";
    674710    my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$");
    675711    my $utf8_file = $self->filename_to_utf8_metadata($file);
     712#    $utf8_file = &unicode::url_encode($utf8_file);
     713
    676714    my $web_url = "http://";
    677715    if(defined $dirname) { # local directory
     
    680718    $web_url = $web_url.$utf8_file;
    681719    }
     720#    print STDERR "#### weburl: $web_url\n";
     721
    682722    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
    683723
     
    728768            $found_something = 1;
    729769            $cursection = $doc_obj->get_parent_section ($cursection);
    730         } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {
     770        } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) {
    731771            my $metaname = $1;
    732             my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;
     772            my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0;
    733773            $comment =~ s/^(.*?)$lt\/Metadata$gt//s;
    734774            my $metavalue = $1;
     
    743783            $metavalue =~ s/[\cJ\cM]/ /sg;
    744784            $metavalue =~ s/<[^>]+>//sg
    745             unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);
     785            unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/);
    746786            $metavalue =~ s/\s+/ /sg;
    747787            if ($accumulate) {
     
    765805    $$textref =~ s/^.*?<body[^>]*>//is;
    766806    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg;
    767     if ($$textref =~ /\S/) {
     807    if ($$textref =~ m/\S/) {
    768808        if (!$found_something) {
    769809        if ($self->{'verbosity'} > 2) {
     
    891931    if (!$self->{'nolinks'}) {
    892932    # usemap="./#index" not handled correctly => change to "#index"
    893     $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
     933##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
     934
     935    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
    894936        $self->replace_usemap_links($1, $2, $3)/isge;
    895937
    896     $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
     938##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/
     939
     940    $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/
    897941        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge;
    898942    }
     
    951995    #   absolute paths for the images, and without the "file://" prefix
    952996    # So check for this special case and massage the data to be correct
    953     if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) {
     997    if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) {
    954998    $link =~ s/^.*\\([^\\]+)$/$1/;
    955999    }
     
    9591003    my $img_file =  $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section);
    9601004
    961     print STDERR "**** link = $link\n";
    962     print STDERR "**** href = $href\n";
    963     print STDERR "**** img_file = $img_file\n";
     1005#    print STDERR "**** link = $link\n";
     1006#    print STDERR "**** href = $href\n";
     1007#    print STDERR "**** img_file = $img_file\n";
    9641008
    9651009    my $anchor_name = $img_file;
     
    9811025    my $self = shift (@_);
    9821026    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_;
     1027
     1028    # remove quotes from link at start and end if necessary
     1029    if ($link=~/^[\"\']/) {
     1030    $link=~s/^[\"\']//;
     1031    $link=~s/[\"\']$//;
     1032    $front.='"';
     1033    $back="\"$back";
     1034    }
    9831035
    9841036    # attempt to sort out targets - frames are not handled
     
    9911043    $back =~ s/target=\"?_parent\"?//is;
    9921044
    993     return $front . $link . $back if $link =~ /^\#/s;
     1045    return $front . $link . $back if $link =~ m/^\#/s;
    9941046    $link =~ s/\n/ /g;
    9951047
     1048    # Find file referred to by $link on file system
     1049    # This is more complicated than it sounds when char encodings
     1050    # is taken in to account
     1051##    &unicode::ensure_utf8(\$link);
     1052##    $link = &unicode::url_encode($link);
     1053#    print STDERR "#### filepath: ".&util::filename_cat($base_dir,$file)."\n";
     1054#    print STDERR "#### link: $link\n";
     1055
    9961056    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file);
     1057#    print STDERR "#### href: $href\n";
     1058 
    9971059    # href may use '\'s where '/'s should be on Windows
    9981060    $href =~ s/\\/\//g;
    9991061
    1000     my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/;
     1062##    $href = &unicode::url_decode($href);
     1063#    print STDERR "#### href again: $href\n";
     1064    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/;
    10011065   
    10021066
     
    10101074    ##### the intermediate page) in the top level window - I'm not sure if that's
    10111075    ##### possible - the following line should probably be deleted if that can be done
    1012     return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;
    1013 
    1014 
    1015     if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||
    1016     ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
     1076    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is;
     1077
     1078
     1079    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||
     1080    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {
    10171081    &ghtml::urlsafe ($href);
    10181082    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back;
     
    10401104
    10411105    $filename = &util::filename_cat($base_dir, $filename);
    1042    
    1043     # Replace %XX's in URL with decoded value if required.
    1044     # Note that the filename may include the %XX in some situations
    1045     if ($filename =~ m/\%[A-F0-9]{2}/i) {
    1046     if (!-e $filename) {
    1047         $filename =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
    1048     }
    1049     }
    1050 
    1051     my ($ext) = $filename =~ /(\.[^\.]*)$/;
     1106#    print STDERR "**** filename: $filename\n";
     1107    # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some
     1108    # situations. If the *original* file's name was in URL encoding, the following method will not decode it.
     1109    my $utf8_filename = $filename;
     1110   
     1111#    print STDERR "*** filename before URL decoding: $filename\n";
     1112    $filename = $self->opt_url_decode($utf8_filename);
     1113#    print STDERR "*** filename after URL decoding:  $filename\n\n";
     1114
     1115    # some special processing if the intended filename was converted to utf8, but
     1116    # the actual file still needs to be renamed
     1117    if (!-e $filename) {
     1118    # try the original filename stored in map
     1119    my $original_filename = $self->{'utf8_to_original_filename'}->{$filename};
     1120    if (-e $original_filename) {
     1121        $filename = $original_filename;
     1122    }
     1123    }
     1124   
     1125    my ($ext) = $filename =~ m/(\.[^\.]*)$/;
    10521126
    10531127    if ($rl == 0) {
    1054     if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
     1128    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
    10551129        return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part;
    10561130    }
     
    10601134    }
    10611135
    1062     if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {
     1136    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) {
    10631137    return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part;
    10641138    }
     
    10751149    return "_httpdocimg_/$newname";
    10761150    } else {
    1077     ($newname) = $filename =~ /([^\/\\]*)$/;
    1078     # Make sure this name is a valid utf8 filename
    1079     ## &unicode::ensure_utf8(\$newname);
    1080     $newname =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
     1151    ($newname) = $utf8_filename =~ m/([^\/\\]*)$/;
     1152   
     1153#   print STDERR "Before url encoding newname: $newname\n";
     1154    # Make sure this name uses only ASCII characters
     1155    # => use URL encoding, as it preserves original encoding
     1156    $newname = &unicode::url_encode($newname);
     1157#   print STDERR "After url encoding newname: $newname\n";
     1158#   print STDERR "*** Real name and converted filename:\n\t$filename\n\t$newname\n";
    10811159
    10821160    $doc_obj->associate_file($filename, $newname, undef, $section);
    10831161
     1162    # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups
     1163    # of filenames, URL-encode the additional percent signs of the URL-encoded filename
    10841164    my $newname_url = $newname;
    10851165    $newname_url =~ s/%/%25/g;
    1086     return "_httpdocimg_/$newname_url";
    1087     }
    1088 }
    1089 
     1166    return "_httpdocimg_/$newname_url";
     1167    }
     1168}
    10901169
    10911170
     
    10941173    my ($link, $base_dir, $file) = @_;
    10951174
    1096     my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;
     1175    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/;
    10971176   
    10981177    $hash_part = "" if !defined $hash_part;
    1099     if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {
     1178    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) {
    11001179    my $outhandle = $self->{'outhandle'};
    11011180    print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n"
     
    11071186    my $type = $1;
    11081187
    1109     if ($link =~ /^(http|ftp):/i) {
     1188    if ($link =~ m/^(http|ftp):/i) {
    11101189        # Turn url (using /) into file name (possibly using \ on windows)
    11111190        my @http_dir_split = split('/', $before_hash);
     
    11211200
    11221201    # make sure there's a slash on the end if it's a directory
    1123     if ($before_hash !~ /\/$/) {
     1202    if ($before_hash !~ m/\/$/) {
    11241203        $before_hash .= "/" if (-d $linkfilename);
    11251204    }
    1126 
    11271205    return ($type . $before_hash, $hash_part, $rl);
    11281206   
    1129     } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) {
    1130     if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) {
     1207    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) {
     1208
     1209    if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) {
    11311210
    11321211        # the first directory will be the domain name if file_is_url
     
    11451224        # => turn into relative link if this is so!
    11461225       
    1147         if ($ENV{'GSDLOS'} =~ /^windows/i) {
     1226        if ($ENV{'GSDLOS'} =~ m/^windows/i) {
    11481227            # too difficult doing a pattern match with embedded '\'s...
    11491228            my $win_before_hash=$before_hash;
     
    11681247        my $dirname = &File::Basename::dirname($file);
    11691248        $before_hash = &util::filename_cat($dirname, $before_hash);
    1170         $before_hash = $self->eval_dir_dots($before_hash);
     1249        $before_hash = $self->eval_dir_dots($before_hash);             
     1250       
     1251#       print STDERR "#### before_hash: $before_hash\n";
    11711252    }
    11721253
    11731254    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);
    11741255    # make sure there's a slash on the end if it's a directory
    1175     if ($before_hash !~ /\/$/) {
     1256    if ($before_hash !~ m/\/$/) {
    11761257        $before_hash .= "/" if (-d $linkfilename);
    11771258    }
     
    12241305
    12251306    # support tag<tagname>
    1226     if ($field =~ /^(.*?)<(.*?)>$/) {
     1307    if ($field =~ m/^(.*?)<(.*?)>$/) {
    12271308        # "$2" is the user's preferred gs metadata name
    12281309        $find_fields{lc($1)}=$2; # lc = lowercase
     
    12621343    my $found_title = 0;
    12631344    # this assumes that ">" won't appear. (I don't think it's allowed to...)
    1264     $html_header =~ /^/; # match the start of the string, for \G assertion
     1345    $html_header =~ m/^/; # match the start of the string, for \G assertion
    12651346   
    12661347    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) {
     
    12691350
    12701351    # find the tag name
    1271     $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
     1352    $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;
    12721353    $tag=$2;
    12731354    # in case they're not using " or ', but they should...
    12741355    if (! $tag) {
    1275         $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
     1356        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
    12761357        $tag=$1;
    12771358    }
     
    12871368
    12881369    # find the tag content
    1289     $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;
     1370    $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is;
    12901371    $value=$2;
    12911372
    12921373    if (! $value) {
    1293         $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
     1374        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;
    12941375        $value=$1;
    12951376    }
     
    13161397    print $outhandle " extracted \"$tag\" metadata \"$value\"\n"
    13171398        if ($self->{'verbosity'} > 2);
    1318     if ($tag =~ /date.*/i){
     1399    if ($tag =~ m/date.*/i){
    13191400        $tag = lc($tag);
    13201401    }
     
    13291410    my $title;
    13301411    my $from = ""; # for debugging output only
    1331     if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {
     1412    if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) {
    13321413        $title = $1;
    13331414        $from = "<title> tags";
     
    13681449
    13691450    foreach my $field (keys %find_fields) {
    1370     if ($field !~ /^tag([a-z0-9]+)$/i) {next}
     1451    if ($field !~ m/^tag([a-z0-9]+)$/i) {next}
    13711452    my $tag = $1;
    13721453    if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) {
     
    14301511    my ($front, $link, $back) = @_;
    14311512
     1513    # remove quotes from link at start and end if necessary
     1514    if ($link=~/^[\"\']/) {
     1515    $link=~s/^[\"\']//;
     1516    $link=~s/[\"\']$//;
     1517    $front.='"';
     1518    $back="\"$back";
     1519    }
     1520
    14321521    $link =~ s/^\.\///;
    14331522    return $front . $link . $back;
Note: See TracChangeset for help on using the changeset viewer.