Changeset 16769

Show
Ignore:
Timestamp:
13.08.2008 16:57:24 (11 years ago)
Author:
ak19
Message:

Intermediate version (with commented out debug statements). 1. Works with multilingual image filenames; 2. Improved pattern match for identifying href_links and similar (uses pattern match for img src links); 3. Explicit use of m for match statements

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/HTMLPlugin.pm

    r16735 r16769  
    165165     
    166166    # check for symbol fonts 
    167     if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 
     167    if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 
    168168        my $font = $1; 
    169169        print STDERR "HBPlug::HB_gettext - warning removed font $font\n"  
    170         if ($font !~ /^arial$/i); 
     170        if ($font !~ m/^arial$/i); 
    171171    } 
    172172 
     
    191191    # remove tags without a starting tag from the section 
    192192    my ($tag, $tagstart); 
    193     while ($section =~ /<\/([^>]{1,10})>/) { 
     193    while ($section =~ m/<\/([^>]{1,10})>/) { 
    194194    $tag = $1; 
    195195    $tagstart = index($section, "<$tag"); 
     
    353353    my $f_separator = &util::get_os_dirsep(); 
    354354     
    355     if ($dirname =~ /import$f_separator/) 
     355    if ($dirname =~ m/import$f_separator/) 
    356356    { 
    357357        $test_dirname = $'; #' 
     
    359359    #print STDERR "init $'\n"; 
    360360     
    361     while ($test_dirname =~ /[$f_separator]/) 
     361    while ($test_dirname =~ m/[$f_separator]/) 
    362362    { 
    363363        my $folderdirname = $`; 
     
    504504     
    505505    # set the file to be tidied 
    506     $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/; 
     506    $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/; 
    507507     
    508508    # get the tidied file 
     
    572572 
    573573    # read in file ($text will be in utf8) 
    574     my $text = ""; 
    575     $self->read_file ($filename_full_path, $encoding, $language, \$text); 
    576     my $textref = \$text; 
     574    my $raw_text = ""; 
     575    $self->read_file_no_decoding ($filename_full_path, \$raw_text); 
     576 
     577    my $textref = \$raw_text; 
    577578    my $opencom = '(?:<!--|&lt;!(?:&mdash;|&#151;|--))'; 
    578579    my $closecom = '(?:-->|(?:&mdash;|&#151;|--)&gt;)'; 
     
    587588    my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 
    588589 
     590    if(!defined $self->{'utf8_to_original_filename'}) {  
     591    # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 
     592    $self->{'utf8_to_original_filename'} = {}; 
     593    } 
     594 
    589595    foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 
    590596 
     
    605611    } 
    606612    $link = $self->eval_dir_dots($link); 
    607  
    608     # Replace %XX's in URL with decoded value if required.  
    609     # Note that the filename may include the %XX in some situations 
    610     if ($link =~ m/\%[A-F0-9]{2}/i) { 
    611         if (!-e $link) { 
    612         $link =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige; 
    613         } 
    614     } 
    615613     
    616     $block_hash->{'file_blocks'}->{$link} = 1; 
    617     } 
     614 
     615    # this is the actual filename on the filesystem (that the link refers to) 
     616    my $url_original_filename = $self->opt_url_decode($link); 
     617 
     618    # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 
     619    my $utf8_link = ""; 
     620    $self->decode_text($link,$encoding,$language,\$utf8_link); 
     621### my $utf8_url_encoded_link = &unicode::url_encode($utf8_link); 
     622 
     623    $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 
     624### $self->{'utf8_to_original_filename'}->{$utf8_url_encoded_link} = $url_original_filename; 
     625 
     626#   print STDERR "**** Storing block link: $link\n"; 
     627#   print STDERR "**** URL original filename: $url_original_filename\n"; 
     628    print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 
     629 
     630    $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 
     631    } 
     632} 
     633 
     634# Given a filename in any encoding, will URL decode it to get back the original filename 
     635# in the original encoding. Because this method is intended to work out the *original*  
     636# filename*, it not URL decode any filename if a file by the name of the *URL-encoded* 
     637# string already exists in the local folder. 
     638# Return the original filename corresponding to the parameter URL-encoded filename, and 
     639# a decoded flag that is set to true iff URL-decoding had to be applied. 
     640sub opt_url_decode { 
     641    my $self = shift (@_); 
     642    my ($link) = @_; 
     643 
     644    # Replace %XX's in URL with decoded value if required. 
     645    # Note that the filename may include the %XX in some situations 
     646    if ($link =~ m/\%[A-F0-9]{2}/i) { 
     647    if (!-e $link) { 
     648        $link = &unicode::url_decode($link); 
     649    } 
     650    }  
     651 
     652    return $link; 
    618653} 
    619654 
     
    625660    my $outhandle = $self->{'outhandle'}; 
    626661 
    627     if ($ENV{'GSDLOS'} =~ /^windows/i) { 
     662    if ($ENV{'GSDLOS'} =~ m/^windows/i) { 
    628663    # this makes life so much easier... perl can cope with unix-style '/'s. 
    629664    $base_dir =~ s@(\\)+@/@g; 
     
    672707    # links, so even if 'file_is_url' is off, still need to store info 
    673708 
     709#    print STDERR "#### file: $file\n"; 
    674710    my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$"); 
    675711    my $utf8_file = $self->filename_to_utf8_metadata($file); 
     712#    $utf8_file = &unicode::url_encode($utf8_file); 
     713 
    676714    my $web_url = "http://"; 
    677715    if(defined $dirname) { # local directory 
     
    680718    $web_url = $web_url.$utf8_file; 
    681719    } 
     720#    print STDERR "#### weburl: $web_url\n"; 
     721 
    682722    $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 
    683723 
     
    728768            $found_something = 1; 
    729769            $cursection = $doc_obj->get_parent_section ($cursection); 
    730         } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) { 
     770        } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) { 
    731771            my $metaname = $1; 
    732             my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0; 
     772            my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0; 
    733773            $comment =~ s/^(.*?)$lt\/Metadata$gt//s; 
    734774            my $metavalue = $1; 
     
    743783            $metavalue =~ s/[\cJ\cM]/ /sg; 
    744784            $metavalue =~ s/<[^>]+>//sg 
    745             unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/); 
     785            unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/); 
    746786            $metavalue =~ s/\s+/ /sg; 
    747787            if ($accumulate) { 
     
    765805    $$textref =~ s/^.*?<body[^>]*>//is; 
    766806    $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 
    767     if ($$textref =~ /\S/) { 
     807    if ($$textref =~ m/\S/) { 
    768808        if (!$found_something) { 
    769809        if ($self->{'verbosity'} > 2) { 
     
    891931    if (!$self->{'nolinks'}) { 
    892932    # usemap="./#index" not handled correctly => change to "#index" 
    893     $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 
     933##  $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 
     934 
     935    $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 
    894936        $self->replace_usemap_links($1, $2, $3)/isge; 
    895937 
    896     $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 
     938##  $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 
     939 
     940    $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 
    897941        $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 
    898942    } 
     
    951995    #   absolute paths for the images, and without the "file://" prefix 
    952996    # So check for this special case and massage the data to be correct 
    953     if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ /^[A-Za-z]\:\\/) { 
     997    if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) { 
    954998    $link =~ s/^.*\\([^\\]+)$/$1/; 
    955999    } 
     
    9591003    my $img_file =  $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section); 
    9601004 
    961     print STDERR "**** link = $link\n"; 
    962     print STDERR "**** href = $href\n"; 
    963     print STDERR "**** img_file = $img_file\n"; 
     1005#    print STDERR "**** link = $link\n"; 
     1006#    print STDERR "**** href = $href\n"; 
     1007#    print STDERR "**** img_file = $img_file\n"; 
    9641008 
    9651009    my $anchor_name = $img_file; 
     
    9811025    my $self = shift (@_); 
    9821026    my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_; 
     1027 
     1028    # remove quotes from link at start and end if necessary 
     1029    if ($link=~/^[\"\']/) { 
     1030    $link=~s/^[\"\']//; 
     1031    $link=~s/[\"\']$//; 
     1032    $front.='"'; 
     1033    $back="\"$back"; 
     1034    } 
    9831035 
    9841036    # attempt to sort out targets - frames are not handled  
     
    9911043    $back =~ s/target=\"?_parent\"?//is; 
    9921044 
    993     return $front . $link . $back if $link =~ /^\#/s; 
     1045    return $front . $link . $back if $link =~ m/^\#/s; 
    9941046    $link =~ s/\n/ /g; 
    9951047 
     1048    # Find file referred to by $link on file system  
     1049    # This is more complicated than it sounds when char encodings 
     1050    # is taken in to account 
     1051##    &unicode::ensure_utf8(\$link); 
     1052##    $link = &unicode::url_encode($link); 
     1053#    print STDERR "#### filepath: ".&util::filename_cat($base_dir,$file)."\n"; 
     1054#    print STDERR "#### link: $link\n"; 
     1055 
    9961056    my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file); 
     1057#    print STDERR "#### href: $href\n"; 
     1058  
    9971059    # href may use '\'s where '/'s should be on Windows 
    9981060    $href =~ s/\\/\//g; 
    9991061 
    1000     my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/; 
     1062##    $href = &unicode::url_decode($href); 
     1063#    print STDERR "#### href again: $href\n"; 
     1064    my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/; 
    10011065     
    10021066 
     
    10101074    ##### the intermediate page) in the top level window - I'm not sure if that's  
    10111075    ##### possible - the following line should probably be deleted if that can be done 
    1012     return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is; 
    1013  
    1014  
    1015     if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||  
    1016     ($href =~ /\/$/) || ($href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 
     1076    return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is; 
     1077 
     1078 
     1079    if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) ||  
     1080    ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 
    10171081    &ghtml::urlsafe ($href); 
    10181082    return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back; 
     
    10401104 
    10411105    $filename = &util::filename_cat($base_dir, $filename); 
    1042      
    1043     # Replace %XX's in URL with decoded value if required.  
    1044     # Note that the filename may include the %XX in some situations 
    1045     if ($filename =~ m/\%[A-F0-9]{2}/i) { 
    1046     if (!-e $filename) { 
    1047         $filename =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige; 
    1048     } 
    1049     } 
    1050  
    1051     my ($ext) = $filename =~ /(\.[^\.]*)$/; 
     1106#    print STDERR "**** filename: $filename\n"; 
     1107    # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some 
     1108    # situations. If the *original* file's name was in URL encoding, the following method will not decode it. 
     1109    my $utf8_filename = $filename; 
     1110     
     1111#    print STDERR "*** filename before URL decoding: $filename\n"; 
     1112    $filename = $self->opt_url_decode($utf8_filename); 
     1113#    print STDERR "*** filename after URL decoding:  $filename\n\n"; 
     1114 
     1115    # some special processing if the intended filename was converted to utf8, but 
     1116    # the actual file still needs to be renamed 
     1117    if (!-e $filename) { 
     1118    # try the original filename stored in map 
     1119    my $original_filename = $self->{'utf8_to_original_filename'}->{$filename}; 
     1120    if (-e $original_filename) { 
     1121        $filename = $original_filename; 
     1122    } 
     1123    } 
     1124     
     1125    my ($ext) = $filename =~ m/(\.[^\.]*)$/; 
    10521126 
    10531127    if ($rl == 0) { 
    1054     if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) { 
     1128    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) { 
    10551129        return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part; 
    10561130    } 
     
    10601134    } 
    10611135 
    1062     if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) { 
     1136    if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) { 
    10631137    return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part; 
    10641138    } 
     
    10751149    return "_httpdocimg_/$newname"; 
    10761150    } else { 
    1077     ($newname) = $filename =~ /([^\/\\]*)$/; 
    1078     # Make sure this name is a valid utf8 filename 
    1079     ## &unicode::ensure_utf8(\$newname); 
    1080     $newname =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg; 
     1151    ($newname) = $utf8_filename =~ m/([^\/\\]*)$/; 
     1152     
     1153#   print STDERR "Before url encoding newname: $newname\n"; 
     1154    # Make sure this name uses only ASCII characters  
     1155    # => use URL encoding, as it preserves original encoding 
     1156    $newname = &unicode::url_encode($newname); 
     1157#   print STDERR "After url encoding newname: $newname\n"; 
     1158#   print STDERR "*** Real name and converted filename:\n\t$filename\n\t$newname\n"; 
    10811159 
    10821160    $doc_obj->associate_file($filename, $newname, undef, $section); 
    10831161 
     1162    # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups 
     1163    # of filenames, URL-encode the additional percent signs of the URL-encoded filename 
    10841164    my $newname_url = $newname; 
    10851165    $newname_url =~ s/%/%25/g; 
    1086     return "_httpdocimg_/$newname_url"; 
    1087     } 
    1088 } 
    1089  
     1166    return "_httpdocimg_/$newname_url";  
     1167    } 
     1168} 
    10901169 
    10911170 
     
    10941173    my ($link, $base_dir, $file) = @_; 
    10951174 
    1096     my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/; 
     1175    my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/; 
    10971176     
    10981177    $hash_part = "" if !defined $hash_part; 
    1099     if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) { 
     1178    if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) { 
    11001179    my $outhandle = $self->{'outhandle'}; 
    11011180    print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n" 
     
    11071186    my $type = $1; 
    11081187 
    1109     if ($link =~ /^(http|ftp):/i) { 
     1188    if ($link =~ m/^(http|ftp):/i) { 
    11101189        # Turn url (using /) into file name (possibly using \ on windows) 
    11111190        my @http_dir_split = split('/', $before_hash); 
     
    11211200 
    11221201    # make sure there's a slash on the end if it's a directory 
    1123     if ($before_hash !~ /\/$/) { 
     1202    if ($before_hash !~ m/\/$/) { 
    11241203        $before_hash .= "/" if (-d $linkfilename); 
    11251204    } 
    1126  
    11271205    return ($type . $before_hash, $hash_part, $rl); 
    11281206     
    1129     } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) { 
    1130     if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) { 
     1207    } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) { 
     1208 
     1209    if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) { 
    11311210 
    11321211        # the first directory will be the domain name if file_is_url 
     
    11451224        # => turn into relative link if this is so! 
    11461225         
    1147         if ($ENV{'GSDLOS'} =~ /^windows/i) { 
     1226        if ($ENV{'GSDLOS'} =~ m/^windows/i) { 
    11481227            # too difficult doing a pattern match with embedded '\'s... 
    11491228            my $win_before_hash=$before_hash; 
     
    11681247        my $dirname = &File::Basename::dirname($file); 
    11691248        $before_hash = &util::filename_cat($dirname, $before_hash); 
    1170         $before_hash = $self->eval_dir_dots($before_hash); 
     1249        $before_hash = $self->eval_dir_dots($before_hash);               
     1250         
     1251#       print STDERR "#### before_hash: $before_hash\n"; 
    11711252    } 
    11721253 
    11731254    my $linkfilename = &util::filename_cat ($base_dir, $before_hash);  
    11741255    # make sure there's a slash on the end if it's a directory 
    1175     if ($before_hash !~ /\/$/) { 
     1256    if ($before_hash !~ m/\/$/) { 
    11761257        $before_hash .= "/" if (-d $linkfilename); 
    11771258    } 
     
    12241305 
    12251306    # support tag<tagname> 
    1226     if ($field =~ /^(.*?)<(.*?)>$/) { 
     1307    if ($field =~ m/^(.*?)<(.*?)>$/) { 
    12271308        # "$2" is the user's preferred gs metadata name 
    12281309        $find_fields{lc($1)}=$2; # lc = lowercase 
     
    12621343    my $found_title = 0; 
    12631344    # this assumes that ">" won't appear. (I don't think it's allowed to...) 
    1264     $html_header =~ /^/; # match the start of the string, for \G assertion 
     1345    $html_header =~ m/^/; # match the start of the string, for \G assertion 
    12651346     
    12661347    while ($html_header =~ m/\G.*?<meta(.*?)>/sig) { 
     
    12691350 
    12701351    # find the tag name 
    1271     $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; 
     1352    $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; 
    12721353    $tag=$2; 
    12731354    # in case they're not using " or ', but they should... 
    12741355    if (! $tag) { 
    1275         $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; 
     1356        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; 
    12761357        $tag=$1; 
    12771358    } 
     
    12871368 
    12881369    # find the tag content 
    1289     $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is; 
     1370    $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is; 
    12901371    $value=$2; 
    12911372 
    12921373    if (! $value) { 
    1293         $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; 
     1374        $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; 
    12941375        $value=$1; 
    12951376    } 
     
    13161397    print $outhandle " extracted \"$tag\" metadata \"$value\"\n"  
    13171398        if ($self->{'verbosity'} > 2); 
    1318     if ($tag =~ /date.*/i){ 
     1399    if ($tag =~ m/date.*/i){ 
    13191400        $tag = lc($tag); 
    13201401    } 
     
    13291410    my $title; 
    13301411    my $from = ""; # for debugging output only 
    1331     if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) { 
     1412    if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) { 
    13321413        $title = $1; 
    13331414        $from = "<title> tags"; 
     
    13681449 
    13691450    foreach my $field (keys %find_fields) { 
    1370     if ($field !~ /^tag([a-z0-9]+)$/i) {next} 
     1451    if ($field !~ m/^tag([a-z0-9]+)$/i) {next} 
    13711452    my $tag = $1; 
    13721453    if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) { 
     
    14301511    my ($front, $link, $back) = @_; 
    14311512 
     1513    # remove quotes from link at start and end if necessary 
     1514    if ($link=~/^[\"\']/) { 
     1515    $link=~s/^[\"\']//; 
     1516    $link=~s/[\"\']$//; 
     1517    $front.='"'; 
     1518    $back="\"$back"; 
     1519    } 
     1520 
    14321521    $link =~ s/^\.\///; 
    14331522    return $front . $link . $back;