Changeset 16769
- Timestamp:
- 2008-08-13T16:57:24+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/plugins/HTMLPlugin.pm
r16735 r16769 165 165 166 166 # check for symbol fonts 167 if ($line =~ /<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) {167 if ($line =~ m/<font [^>]*?face\s*=\s*\"?(\w+)\"?/i) { 168 168 my $font = $1; 169 169 print STDERR "HBPlug::HB_gettext - warning removed font $font\n" 170 if ($font !~ /^arial$/i);170 if ($font !~ m/^arial$/i); 171 171 } 172 172 … … 191 191 # remove tags without a starting tag from the section 192 192 my ($tag, $tagstart); 193 while ($section =~ /<\/([^>]{1,10})>/) {193 while ($section =~ m/<\/([^>]{1,10})>/) { 194 194 $tag = $1; 195 195 $tagstart = index($section, "<$tag"); … … 353 353 my $f_separator = &util::get_os_dirsep(); 354 354 355 if ($dirname =~ /import$f_separator/)355 if ($dirname =~ m/import$f_separator/) 356 356 { 357 357 $test_dirname = $'; #' … … 359 359 #print STDERR "init $'\n"; 360 360 361 while ($test_dirname =~ /[$f_separator]/)361 while ($test_dirname =~ m/[$f_separator]/) 362 362 { 363 363 my $folderdirname = $`; … … 504 504 505 505 # set the file to be tidied 506 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ /\w/;506 $input_filename = &util::filename_cat($base_dir,$file) if $base_dir =~ m/\w/; 507 507 508 508 # get the tidied file … … 572 572 573 573 # read in file ($text will be in utf8) 574 my $text = ""; 575 $self->read_file ($filename_full_path, $encoding, $language, \$text); 576 my $textref = \$text; 574 my $raw_text = ""; 575 $self->read_file_no_decoding ($filename_full_path, \$raw_text); 576 577 my $textref = \$raw_text; 577 578 my $opencom = '(?:<!--|<!(?:—|—|--))'; 578 579 my $closecom = '(?:-->|(?:—|—|--)>)'; … … 587 588 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 588 589 590 if(!defined $self->{'utf8_to_original_filename'}) { 591 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 592 $self->{'utf8_to_original_filename'} = {}; 593 } 594 589 595 foreach my $link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 590 596 … … 605 611 } 606 612 $link = $self->eval_dir_dots($link); 607 608 # Replace %XX's in URL with decoded value if required.609 # Note that the filename may include the %XX in some situations610 if ($link =~ m/\%[A-F0-9]{2}/i) {611 if (!-e $link) {612 $link =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;613 }614 }615 613 616 $block_hash->{'file_blocks'}->{$link} = 1; 617 } 614 615 # this is the actual filename on the filesystem (that the link refers to) 616 my $url_original_filename = $self->opt_url_decode($link); 617 618 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 619 my $utf8_link = ""; 620 $self->decode_text($link,$encoding,$language,\$utf8_link); 621 ### my $utf8_url_encoded_link = &unicode::url_encode($utf8_link); 622 623 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 624 ### $self->{'utf8_to_original_filename'}->{$utf8_url_encoded_link} = $url_original_filename; 625 626 # print STDERR "**** Storing block link: $link\n"; 627 # print STDERR "**** URL original filename: $url_original_filename\n"; 628 print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 629 630 $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 631 } 632 } 633 634 # Given a filename in any encoding, will URL decode it to get back the original filename 635 # in the original encoding. Because this method is intended to work out the *original* 636 # filename*, it not URL decode any filename if a file by the name of the *URL-encoded* 637 # string already exists in the local folder. 638 # Return the original filename corresponding to the parameter URL-encoded filename, and 639 # a decoded flag that is set to true iff URL-decoding had to be applied. 640 sub opt_url_decode { 641 my $self = shift (@_); 642 my ($link) = @_; 643 644 # Replace %XX's in URL with decoded value if required. 645 # Note that the filename may include the %XX in some situations 646 if ($link =~ m/\%[A-F0-9]{2}/i) { 647 if (!-e $link) { 648 $link = &unicode::url_decode($link); 649 } 650 } 651 652 return $link; 618 653 } 619 654 … … 625 660 my $outhandle = $self->{'outhandle'}; 626 661 627 if ($ENV{'GSDLOS'} =~ /^windows/i) {662 if ($ENV{'GSDLOS'} =~ m/^windows/i) { 628 663 # this makes life so much easier... perl can cope with unix-style '/'s. 629 664 $base_dir =~ s@(\\)+@/@g; … … 672 707 # links, so even if 'file_is_url' is off, still need to store info 673 708 709 # print STDERR "#### file: $file\n"; 674 710 my ($tailname,$dirname,$suffix) = &File::Basename::fileparse($file, "\\.[^\\.]+\$"); 675 711 my $utf8_file = $self->filename_to_utf8_metadata($file); 712 # $utf8_file = &unicode::url_encode($utf8_file); 713 676 714 my $web_url = "http://"; 677 715 if(defined $dirname) { # local directory … … 680 718 $web_url = $web_url.$utf8_file; 681 719 } 720 # print STDERR "#### weburl: $web_url\n"; 721 682 722 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 683 723 … … 728 768 $found_something = 1; 729 769 $cursection = $doc_obj->get_parent_section ($cursection); 730 } elsif ($tag =~ /^Metadata name=$quot(.*?)$quot/s) {770 } elsif ($tag =~ m/^Metadata name=$quot(.*?)$quot/s) { 731 771 my $metaname = $1; 732 my $accumulate = $tag =~ /mode=${quot}accumulate${quot}/ ? 1 : 0;772 my $accumulate = $tag =~ m/mode=${quot}accumulate${quot}/ ? 1 : 0; 733 773 $comment =~ s/^(.*?)$lt\/Metadata$gt//s; 734 774 my $metavalue = $1; … … 743 783 $metavalue =~ s/[\cJ\cM]/ /sg; 744 784 $metavalue =~ s/<[^>]+>//sg 745 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ /^($dont_strip)$/);785 unless $dont_strip && ($dont_strip eq 'all' || $metaname =~ m/^($dont_strip)$/); 746 786 $metavalue =~ s/\s+/ /sg; 747 787 if ($accumulate) { … … 765 805 $$textref =~ s/^.*?<body[^>]*>//is; 766 806 $$textref =~ s/(<\/body[^>]*>|<\/html[^>]*>)//isg; 767 if ($$textref =~ /\S/) {807 if ($$textref =~ m/\S/) { 768 808 if (!$found_something) { 769 809 if ($self->{'verbosity'} > 2) { … … 891 931 if (!$self->{'nolinks'}) { 892 932 # usemap="./#index" not handled correctly => change to "#index" 893 $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 933 ## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 934 935 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 894 936 $self->replace_usemap_links($1, $2, $3)/isge; 895 937 896 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 938 ## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 939 940 $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 897 941 $self->replace_href_links ($1, $2, $3, $base_dir, $file, $doc_obj, $cursection)/isge; 898 942 } … … 951 995 # absolute paths for the images, and without the "file://" prefix 952 996 # So check for this special case and massage the data to be correct 953 if ($ENV{'GSDLOS'} =~ /^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~/^[A-Za-z]\:\\/) {997 if ($ENV{'GSDLOS'} =~ m/^windows/i && $self->{'plugin_type'} eq "WordPlug" && $link =~ m/^[A-Za-z]\:\\/) { 954 998 $link =~ s/^.*\\([^\\]+)$/$1/; 955 999 } … … 959 1003 my $img_file = $self->add_file ($href, $rl, $hash_part, $base_dir, $doc_obj, $section); 960 1004 961 print STDERR "**** link = $link\n";962 print STDERR "**** href = $href\n";963 print STDERR "**** img_file = $img_file\n";1005 # print STDERR "**** link = $link\n"; 1006 # print STDERR "**** href = $href\n"; 1007 # print STDERR "**** img_file = $img_file\n"; 964 1008 965 1009 my $anchor_name = $img_file; … … 981 1025 my $self = shift (@_); 982 1026 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_; 1027 1028 # remove quotes from link at start and end if necessary 1029 if ($link=~/^[\"\']/) { 1030 $link=~s/^[\"\']//; 1031 $link=~s/[\"\']$//; 1032 $front.='"'; 1033 $back="\"$back"; 1034 } 983 1035 984 1036 # attempt to sort out targets - frames are not handled … … 991 1043 $back =~ s/target=\"?_parent\"?//is; 992 1044 993 return $front . $link . $back if $link =~ /^\#/s;1045 return $front . $link . $back if $link =~ m/^\#/s; 994 1046 $link =~ s/\n/ /g; 995 1047 1048 # Find file referred to by $link on file system 1049 # This is more complicated than it sounds when char encodings 1050 # is taken in to account 1051 ## &unicode::ensure_utf8(\$link); 1052 ## $link = &unicode::url_encode($link); 1053 # print STDERR "#### filepath: ".&util::filename_cat($base_dir,$file)."\n"; 1054 # print STDERR "#### link: $link\n"; 1055 996 1056 my ($href, $hash_part, $rl) = $self->format_link ($link, $base_dir, $file); 1057 # print STDERR "#### href: $href\n"; 1058 997 1059 # href may use '\'s where '/'s should be on Windows 998 1060 $href =~ s/\\/\//g; 999 1061 1000 my ($filename) = $href =~ /^(?:.*?):(?:\/\/)?(.*)/; 1062 ## $href = &unicode::url_decode($href); 1063 # print STDERR "#### href again: $href\n"; 1064 my ($filename) = $href =~ m/^(?:.*?):(?:\/\/)?(.*)/; 1001 1065 1002 1066 … … 1010 1074 ##### the intermediate page) in the top level window - I'm not sure if that's 1011 1075 ##### possible - the following line should probably be deleted if that can be done 1012 return $front . $link . $back if $href =~ /^(mailto|news|gopher|nntp|telnet|javascript):/is;1013 1014 1015 if (($rl == 0) || ($filename =~ /$self->{'process_exp'}/) ||1016 ($href =~ /\/$/) || ($href =~/^(mailto|news|gopher|nntp|telnet|javascript):/i)) {1076 return $front . $link . $back if $href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/is; 1077 1078 1079 if (($rl == 0) || ($filename =~ m/$self->{'process_exp'}/) || 1080 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 1017 1081 &ghtml::urlsafe ($href); 1018 1082 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back; … … 1040 1104 1041 1105 $filename = &util::filename_cat($base_dir, $filename); 1042 1043 # Replace %XX's in URL with decoded value if required. 1044 # Note that the filename may include the %XX in some situations 1045 if ($filename =~ m/\%[A-F0-9]{2}/i) { 1046 if (!-e $filename) { 1047 $filename =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige; 1048 } 1049 } 1050 1051 my ($ext) = $filename =~ /(\.[^\.]*)$/; 1106 # print STDERR "**** filename: $filename\n"; 1107 # Replace %XX's in URL with decoded value if required. Note that the filename may include the %XX in some 1108 # situations. If the *original* file's name was in URL encoding, the following method will not decode it. 1109 my $utf8_filename = $filename; 1110 1111 # print STDERR "*** filename before URL decoding: $filename\n"; 1112 $filename = $self->opt_url_decode($utf8_filename); 1113 # print STDERR "*** filename after URL decoding: $filename\n\n"; 1114 1115 # some special processing if the intended filename was converted to utf8, but 1116 # the actual file still needs to be renamed 1117 if (!-e $filename) { 1118 # try the original filename stored in map 1119 my $original_filename = $self->{'utf8_to_original_filename'}->{$filename}; 1120 if (-e $original_filename) { 1121 $filename = $original_filename; 1122 } 1123 } 1124 1125 my ($ext) = $filename =~ m/(\.[^\.]*)$/; 1052 1126 1053 1127 if ($rl == 0) { 1054 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {1128 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) { 1055 1129 return "_httpextlink_&rl=0&el=prompt&href=" . $href . $hash_part; 1056 1130 } … … 1060 1134 } 1061 1135 1062 if ((!defined $ext) || ($ext !~ /$self->{'assoc_files'}/)) {1136 if ((!defined $ext) || ($ext !~ m/$self->{'assoc_files'}/)) { 1063 1137 return "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part; 1064 1138 } … … 1075 1149 return "_httpdocimg_/$newname"; 1076 1150 } else { 1077 ($newname) = $filename =~ /([^\/\\]*)$/; 1078 # Make sure this name is a valid utf8 filename 1079 ## &unicode::ensure_utf8(\$newname); 1080 $newname =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg; 1151 ($newname) = $utf8_filename =~ m/([^\/\\]*)$/; 1152 1153 # print STDERR "Before url encoding newname: $newname\n"; 1154 # Make sure this name uses only ASCII characters 1155 # => use URL encoding, as it preserves original encoding 1156 $newname = &unicode::url_encode($newname); 1157 # print STDERR "After url encoding newname: $newname\n"; 1158 # print STDERR "*** Real name and converted filename:\n\t$filename\n\t$newname\n"; 1081 1159 1082 1160 $doc_obj->associate_file($filename, $newname, undef, $section); 1083 1161 1162 # Since the generated image will be URL-encoded to avoid file-system/browser mess-ups 1163 # of filenames, URL-encode the additional percent signs of the URL-encoded filename 1084 1164 my $newname_url = $newname; 1085 1165 $newname_url =~ s/%/%25/g; 1086 return "_httpdocimg_/$newname_url"; 1087 } 1088 } 1089 1166 return "_httpdocimg_/$newname_url"; 1167 } 1168 } 1090 1169 1091 1170 … … 1094 1173 my ($link, $base_dir, $file) = @_; 1095 1174 1096 my ($before_hash, $hash_part) = $link =~ /^([^\#]*)(\#?.*)$/;1175 my ($before_hash, $hash_part) = $link =~ m/^([^\#]*)(\#?.*)$/; 1097 1176 1098 1177 $hash_part = "" if !defined $hash_part; 1099 if (!defined $before_hash || $before_hash !~ /[\w\.\/]/) {1178 if (!defined $before_hash || $before_hash !~ m/[\w\.\/]/) { 1100 1179 my $outhandle = $self->{'outhandle'}; 1101 1180 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n" … … 1107 1186 my $type = $1; 1108 1187 1109 if ($link =~ /^(http|ftp):/i) {1188 if ($link =~ m/^(http|ftp):/i) { 1110 1189 # Turn url (using /) into file name (possibly using \ on windows) 1111 1190 my @http_dir_split = split('/', $before_hash); … … 1121 1200 1122 1201 # make sure there's a slash on the end if it's a directory 1123 if ($before_hash !~ /\/$/) {1202 if ($before_hash !~ m/\/$/) { 1124 1203 $before_hash .= "/" if (-d $linkfilename); 1125 1204 } 1126 1127 1205 return ($type . $before_hash, $hash_part, $rl); 1128 1206 1129 } elsif ($link !~ /^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ /^\//) { 1130 if ($before_hash =~ s@^/@@ || $before_hash =~ /\\/) { 1207 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) { 1208 1209 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) { 1131 1210 1132 1211 # the first directory will be the domain name if file_is_url … … 1145 1224 # => turn into relative link if this is so! 1146 1225 1147 if ($ENV{'GSDLOS'} =~ /^windows/i) {1226 if ($ENV{'GSDLOS'} =~ m/^windows/i) { 1148 1227 # too difficult doing a pattern match with embedded '\'s... 1149 1228 my $win_before_hash=$before_hash; … … 1168 1247 my $dirname = &File::Basename::dirname($file); 1169 1248 $before_hash = &util::filename_cat($dirname, $before_hash); 1170 $before_hash = $self->eval_dir_dots($before_hash); 1249 $before_hash = $self->eval_dir_dots($before_hash); 1250 1251 # print STDERR "#### before_hash: $before_hash\n"; 1171 1252 } 1172 1253 1173 1254 my $linkfilename = &util::filename_cat ($base_dir, $before_hash); 1174 1255 # make sure there's a slash on the end if it's a directory 1175 if ($before_hash !~ /\/$/) {1256 if ($before_hash !~ m/\/$/) { 1176 1257 $before_hash .= "/" if (-d $linkfilename); 1177 1258 } … … 1224 1305 1225 1306 # support tag<tagname> 1226 if ($field =~ /^(.*?)<(.*?)>$/) {1307 if ($field =~ m/^(.*?)<(.*?)>$/) { 1227 1308 # "$2" is the user's preferred gs metadata name 1228 1309 $find_fields{lc($1)}=$2; # lc = lowercase … … 1262 1343 my $found_title = 0; 1263 1344 # this assumes that ">" won't appear. (I don't think it's allowed to...) 1264 $html_header =~ /^/; # match the start of the string, for \G assertion1345 $html_header =~ m/^/; # match the start of the string, for \G assertion 1265 1346 1266 1347 while ($html_header =~ m/\G.*?<meta(.*?)>/sig) { … … 1269 1350 1270 1351 # find the tag name 1271 $metatag =~ /(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is;1352 $metatag =~ m/(?:name|http-equiv)\s*=\s*([\"\'])?(.*?)\1/is; 1272 1353 $tag=$2; 1273 1354 # in case they're not using " or ', but they should... 1274 1355 if (! $tag) { 1275 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;1356 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; 1276 1357 $tag=$1; 1277 1358 } … … 1287 1368 1288 1369 # find the tag content 1289 $metatag =~ /content\s*=\s*([\"\'])?(.*?)\1/is;1370 $metatag =~ m/content\s*=\s*([\"\'])?(.*?)\1/is; 1290 1371 $value=$2; 1291 1372 1292 1373 if (! $value) { 1293 $metatag =~ /(?:name|http-equiv)\s*=\s*([^\s\>]+)/is;1374 $metatag =~ m/(?:name|http-equiv)\s*=\s*([^\s\>]+)/is; 1294 1375 $value=$1; 1295 1376 } … … 1316 1397 print $outhandle " extracted \"$tag\" metadata \"$value\"\n" 1317 1398 if ($self->{'verbosity'} > 2); 1318 if ($tag =~ /date.*/i){1399 if ($tag =~ m/date.*/i){ 1319 1400 $tag = lc($tag); 1320 1401 } … … 1329 1410 my $title; 1330 1411 my $from = ""; # for debugging output only 1331 if ($html_header =~ /<title[^>]*>([^<]+)<\/title[^>]*>/is) {1412 if ($html_header =~ m/<title[^>]*>([^<]+)<\/title[^>]*>/is) { 1332 1413 $title = $1; 1333 1414 $from = "<title> tags"; … … 1368 1449 1369 1450 foreach my $field (keys %find_fields) { 1370 if ($field !~ /^tag([a-z0-9]+)$/i) {next}1451 if ($field !~ m/^tag([a-z0-9]+)$/i) {next} 1371 1452 my $tag = $1; 1372 1453 if ($$textref =~ m@<$tag[^>]*>(.*?)</$tag[^>]*>@g) { … … 1430 1511 my ($front, $link, $back) = @_; 1431 1512 1513 # remove quotes from link at start and end if necessary 1514 if ($link=~/^[\"\']/) { 1515 $link=~s/^[\"\']//; 1516 $link=~s/[\"\']$//; 1517 $front.='"'; 1518 $back="\"$back"; 1519 } 1520 1432 1521 $link =~ s/^\.\///; 1433 1522 return $front . $link . $back;
Note:
See TracChangeset
for help on using the changeset viewer.