- Timestamp:
- 2017-02-23T14:49:41+13:00 (7 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r31415 r31440 186 186 187 187 my ($language, $content_encoding) = $self->textcat_get_language_encoding ($filename_full_path); 188 $self->{'store_content_encoding'}->{$filename_full_path} = $content_encoding; 189 190 # read in file ($text will be in utf8) 188 $self->{'store_content_encoding'}->{$filename_full_path} = [$content_encoding, $language]; 189 190 191 # read in file ($text will be in the filesystem encoding) 191 192 my $raw_text = ""; 192 193 $self->read_file_no_decoding($filename_full_path, \$raw_text); … … 296 297 &util::block_filename($block_hash,$url_original_filename) if $url_original_filename ne $html_fname; 297 298 298 299 # but only add the linked file to the blocklist if the current html file does not link to itself 299 300 300 301 } … … 332 333 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 333 334 334 # Lookup content_encoding worked out in file_block pass for this file335 # Store it under the local name 'content_encoding' so its nice and336 # easy to access337 $self->{' content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path};335 # Lookup content_encoding and language worked out in file_block pass for this file 336 # Store them under the local names they are nice and easy to access 337 $self->{'content_encoding'} = $self->{'store_content_encoding'}->{$filename_full_path}[0]; 338 $self->{'language'} = $self->{'store_content_encoding'}->{$filename_full_path}[1]; 338 339 339 340 # get the input file … … 463 464 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname); 464 465 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname); 465 466 466 467 my $web_url = "http://"; 467 468 my $utf8_web_url = "http://"; 469 468 470 if(defined $dirname) { # local directory 471 469 472 # Check for "ftp" in the domain name of the directory 470 473 # structure to determine if this URL should be a ftp:// URL … … 481 484 $dirname .= &util::get_dirsep() if 
$dirname ne ""; # if there's a directory, it should end on "/" 482 485 483 $web_url = $web_url.$dirname.$url_encoded_file; 484 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file; 486 # this local directory in import may need to be URL encoded like the file 487 my $url_encoded_dir = &unicode::raw_filename_to_url_encoded($dirname); 488 my $utf8_url_encoded_dir = &unicode::raw_filename_to_utf8_url_encoded($dirname); 489 490 # changed here 491 $web_url = $web_url.$url_encoded_dir.$url_encoded_file; 492 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_dir.$utf8_url_encoded_file; 485 493 } else { 486 494 $web_url = $web_url.$url_encoded_file; … … 493 501 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n"; 494 502 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n"; 503 print STDERR "*******DEBUG: web url: $web_url\n"; 504 print STDERR "*******DEBUG: utf8 web url: $utf8_web_url\n"; 495 505 } 496 506 … … 847 857 } 848 858 859 # can't remember adding this :-( must have had a reason though... 860 if ($link =~ /^\_http/ || $link =~ /^\_libraryname\_/) { 861 # assume it is a greenstone one and leave alone 862 return $front . $link . 
$back; 863 } 864 849 865 # attempt to sort out targets - frames are not handled 850 866 # well in this plugin and some cases will screw things … … 912 928 $href = encode($content_encoding,$href); 913 929 } 914 915 $href = &unicode::raw_filename_to_utf8_url_encoded($href); 930 931 $href = &unicode::raw_filename_to_utf8_url_encoded($href); 916 932 $href = &unicode::filename_to_url($href); 917 933 918 934 &ghtml::urlsafe ($href); 919 935 920 936 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 921 937 print STDERR "******DEBUG: href=$href\n"; 922 938 } 923 939 940 #TODO here 941 # if ($rl ==1) { 942 # have a relative link, we need to do URL encoding etc so it matches what has happened for that file 943 #$href = &util::rename_file($href, $self->{'file_rename_method'}); 944 # $href = &unicode::raw_filename_to_url_encoded($href); 945 # then, this might be url encoded, so we replace % with %25 946 # $href = &unicode::filename_to_url($href); 947 # print STDERR "DEBUG: url encoded href = $href\n"; 948 # } 924 949 925 950 return $front . "_httpextlink_&rl=" . $rl . "&href=" . $href . $hash_part . $back; … … 1022 1047 # exists on the file system 1023 1048 $filename = encode($content_encoding, $opt_decode_unicode_filename); 1049 1024 1050 } 1025 1051 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) { … … 1051 1077 # some special processing if the intended filename was converted to utf8, but 1052 1078 # the actual file still needs to be renamed 1053 #if (!&util::fd_exists($filename)) {1054 1079 if (!&FileUtils::fileExists($filename)) { 1055 1080 # try the original filename stored in map … … 1071 1096 } 1072 1097 $filename = $original_filename; 1073 } 1098 } 1074 1099 } 1075 1100 … … 1160 1185 my $self = shift (@_); 1161 1186 my ($link, $base_dir, $file) = @_; 1162 1187 1163 1188 # strip off hash part, e.g. #foo, but watch out for any entities, e.g. 
α 1164 1189 my ($before_hash, $hash_part) = $link =~ m/^(.*?[^&])(\#.*)?$/; … … 1168 1193 my $outhandle = $self->{'outhandle'}; 1169 1194 print $outhandle "HTMLPlugin: ERROR - badly formatted tag ignored ($link)\n" 1170 1195 if $self->{'verbosity'}; 1171 1196 return ($link, "", 0); 1172 1197 } 1173 1198 1199 # my $dirname; 1174 1200 if ($before_hash =~ s@^((?:http|https|ftp|file|mms)://)@@i) { 1175 1176 1201 my $type = $1; 1202 my $before_hash_file = $before_hash; 1177 1203 1178 1204 if ($link =~ m/^(http|ftp):/i) { … … 1187 1213 my $before_hash_url = $before_hash_file; 1188 1214 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 1189 $before_hash_url =~ s@\\@\/@g; 1190 } 1191 1215 $before_hash_url =~ s@\\@\/@g; 1216 } 1217 1218 ######## TODO need to check this for encoding stufff 1192 1219 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash_file); 1193 1220 print STDERR "chekcing for existence whether relative link or not $linkfilename\n"; 1194 1221 my $rl = 0; 1195 1222 $rl = 1 if (-e $linkfilename); 1196 1223 if (-e $linkfilename) { 1224 1225 print STDERR "DOES exist $linkfilename\n"; 1226 } else { 1227 print STDERR "DOESN'T exist $linkfilename\n"; 1228 } 1197 1229 # make sure there's a slash on the end if it's a directory 1198 1230 if ($before_hash_url !~ m/\/$/) { … … 1203 1235 } elsif ($link !~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i && $link !~ m/^\//) { 1204 1236 1237 #### TODO whst is this test doing??? 1205 1238 if ($before_hash =~ s@^/@@ || $before_hash =~ m/\\/) { 1206 1239 … … 1240 1273 } 1241 1274 } else { 1242 # Turn relative file path into full path 1275 1276 # Turn relative file path into full path (inside import dir) 1243 1277 my $dirname = &File::Basename::dirname($file); 1244 $before_hash = &FileUtils::filenameConcatenate($dirname, $before_hash); 1278 1279 # we want to add dirname (which is raw filesystem path) onto $before_hash, (which is perl unicode aware string). 
Convert dirname to perl string 1280 1281 my $unicode_dirname =""; 1282 #my $content_encoding = $self->{'content_encoding'}; 1283 #my $language = $self->{'language'}; 1284 1285 # actually I think this is wrong. why should we use content encoding? 1286 #$self->decode_text($dirname, $content_encoding, $language, \$unicode_dirname); 1287 #my $filename_encoding = $self->{'filename_encoding'}; 1288 # filename_encoding might be auto... 1289 1290 # TODO what is the best thing to do here????? 1291 # try and guess default filesystem encoding, similar to deduce_filename_encoding, but without a file? 1292 my $filename_encoding = "utf8"; 1293 # copied this from set_Source_metadata in BasePlugin 1294 if ((defined $filename_encoding) && ($filename_encoding ne "ascii")) { 1295 # Use filename_encoding to map raw filename to a Perl unicode-aware string 1296 $unicode_dirname = decode($filename_encoding,$dirname); 1297 } 1298 else { 1299 # otherwise generate %xx encoded version of filename for char > 127 1300 $unicode_dirname = &unicode::raw_filename_to_url_encoded($dirname); 1301 } 1302 1303 $before_hash = &FileUtils::filenameConcatenate($unicode_dirname, $before_hash); 1245 1304 $before_hash = $self->eval_dir_dots($before_hash); 1246 $before_hash =~ s@\\@/@g; # for windows 1247 } 1248 1249 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash); 1250 1251 1252 # print STDERR "**** linkfilename = $linkfilename\n"; 1253 # if (!&util::fd_exists($linkfilename)) { 1254 # print STDERR "***** Warning: Could not find $linkfilename\n"; 1255 # } 1256 1305 $before_hash =~ s@\\@/@g; # for windows 1306 } 1307 1308 my $linkfilename = &FileUtils::filenameConcatenate($base_dir, $before_hash); 1257 1309 1258 1310 # make sure there's a slash on the end if it's a directory … … 1260 1312 $before_hash .= "/" if (-d $linkfilename); 1261 1313 } 1262 1263 # print STDERR "*** returning: $before_hash\n";1264 1265 1314 return ("http://" . $before_hash, $hash_part, 1); 1266 1315 } else {
Note: See TracChangeset for help on using the changeset viewer.