- Timestamp:
- 2010-12-06T13:15:10+13:00 (13 years ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/basebuildproc.pm
r23371 r23387 515 515 $field =~ s/^ex\.//; 516 516 517 # special case for U RL metadata518 if ($field =~ m/^U RL$/i) {517 # special case for UTF8URL metadata 518 if ($field =~ m/^UTF8URL$/i) { 519 519 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, 520 520 $value, { 'section' => [ $section_OID ] }); -
main/trunk/greenstone2/perllib/doc.pm
r23362 r23387 1089 1089 $self->delete_assoc_file ($assoc_filename); 1090 1090 1091 if (!&util::fd_exists($real_filename)) { 1092 print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n"; 1093 exit -1; 1094 } 1095 # print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n"; 1096 # print STDERR "****##### so, ensure it is before storing?!?!?\n"; 1097 ## my $utf8_filename = Encode::encode("utf8",$filename); 1098 1091 1099 push (@{$self->{'associated_files'}}, 1092 1100 [$real_filename, $assoc_filename, $mime_type, $section]); -
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r23364 r23387 927 927 928 928 # UTF-8 version of filename 929 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {930 print STDERR "**** Setting Source Metadata given: $octet_file\n";931 }929 # if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 930 # print STDERR "**** Setting Source Metadata given: $octet_file\n"; 931 # } 932 932 933 933 # Deal with (on Windows) raw filenames that are in their … … 937 937 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) { 938 938 if (-e $raw_filename) { 939 require Win32;940 941 ## print STDERR "**** raw filename before LPN: $raw_filename\n";942 939 my $unicode_filename = Win32::GetLongPathName($raw_filename); 943 940 944 941 my $unused_full_uf; 945 942 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename); 946 947 ## print STDERR "**** raw filename after LPN: $raw_filename\n";948 943 } 949 944 } … … 962 957 } 963 958 964 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {965 print STDERR "****** saving Source as: $url_encoded_filename\n";966 }959 # if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 960 # print STDERR "****** saving Source as: $url_encoded_filename\n"; 961 # } 967 962 968 963 … … 978 973 $renamed_raw_url); 979 974 980 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {981 print STDERR "****** saving SourceFile as: $renamed_raw_url\n";982 }975 # if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 976 # print STDERR "****** saving SourceFile as: $renamed_raw_url\n"; 977 # } 983 978 } 984 979 -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r23363 r23387 226 226 my $convert_to_ext = $self->{'convert_to_ext'}; 227 227 228 229 my $upgraded_input_filename = &util::upgrade_if_dos_filename($input_filename); 230 228 231 # derive tmp filename from input filename 229 232 my ($tailname, $dirname, $suffix) 230 = &File::Basename::fileparse($ input_filename, "\\.[^\\.]+\$");233 = &File::Basename::fileparse($upgraded_input_filename, "\\.[^\\.]+\$"); 231 234 232 235 # softlink to collection tmp dir … … 238 241 } 239 242 240 # convert to utf-8 otherwise we have problems with the doc.xml file later on 241 # print STDERR "**** filename $tailname$suffix is already UTF8\n" if &unicode::check_is_utf8($tailname); 242 $tailname = $self->SUPER::filepath_to_utf8($tailname) unless &unicode::check_is_utf8($tailname); 243 # # convert to utf-8 otherwise we have problems with the doc.xml file later on 244 # my $utf8_tailname = (&unicode::check_is_utf8($tailname)) ? $tailname : $self->filepath_to_utf8($tailname); 245 246 # make sure filename to be used can be stored OK in a UTF-8 compliant doc.xml file 247 my $utf8_tailname = &unicode::raw_filename_to_utf8_url_encoded($tailname); 248 243 249 244 250 # URLEncode this since htmls with images where the html filename is utf8 don't seem 245 251 # to work on Windows (IE or Firefox), as browsers are looking for filesystem-encoded 246 252 # files on the filesystem. 247 $ tailname = &util::rename_file($tailname, $self->{'file_rename_method'}, "without_suffix");248 249 $suffix = lc($suffix);250 my $tmp_filename = &util::filename_cat($tmp_dirname, "$ tailname$suffix");253 $utf8_tailname = &util::rename_file($utf8_tailname, $self->{'file_rename_method'}, "without_suffix"); 254 255 my $lc_suffix = lc($suffix); 256 my $tmp_filename = &util::filename_cat($tmp_dirname, "$utf8_tailname$lc_suffix"); 251 257 252 258 # If gsdl is remote, we're given relative path to input file, of the form import/tailname.suffix … … 324 330 } 325 331 } else { 326 $output_filename =~ s/$ suffix$/.$output_type/;332 $output_filename =~ s/$lc_suffix$/.$output_type/; 327 333 } 328 334 … … 446 452 # need to check that not empty 447 453 my ($doc_ext) = $file =~ /\.(\w+)$/; 454 $doc_ext = lc($doc_ext); 448 455 my $file_type = "unknown"; 449 456 $file_type = $self->{'file_type'} if defined $self->{'file_type'}; … … 458 465 $assocfilename = $doc_obj->get_assocfile_from_sourcefile(); 459 466 } 467 460 468 $doc_obj->associate_file($filename, $assocfilename, undef, $cursection); 461 469 … … 476 484 my $tmp_dir = $self->{'tmp_dir'}; 477 485 if (defined $tmp_dir && -d $tmp_dir) { 486 ## print STDERR "**** Supressing clean up of tmp dir\n"; 478 487 &util::rm_r($tmp_dir); 479 488 $self->{'tmp_dir'} = undef; -
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r23371 r23387 37 37 38 38 use Encode; 39 use Unicode::Normalize 'normalize'; 39 40 40 41 use ReadTextFile; … … 206 207 my @script_matches = ($$textref =~ m/<script[^>]*?src\s*=\s*($attval)[^>]*>/igs); 207 208 208 if(!defined $self->{'u tf8_to_original_filename'}) {209 if(!defined $self->{'unicode_to_original_filename'}) { 209 210 # maps from utf8 converted link name -> original filename referrred to by (possibly URL-encoded) src url 210 $self->{'u tf8_to_original_filename'} = {};211 } 212 213 foreach my $ link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) {211 $self->{'unicode_to_original_filename'} = {}; 212 } 213 214 foreach my $raw_link (@img_matches, @usemap_matches, @link_matches, @embed_matches, @tabbg_matches, @script_matches) { 214 215 215 216 # remove quotes from link at start and end if necessary 216 if ($ link =~ m/^\"/) {217 $ link =~ s/^\"//;218 $ link =~ s/\"$//;217 if ($raw_link =~ m/^\"/) { 218 $raw_link =~ s/^\"//; 219 $raw_link =~ s/\"$//; 219 220 } 220 221 221 222 # remove any anchor names, e.g. foo.html#name becomes foo.html 222 223 # but watch out for any #'s that are part of entities, such as α 223 $ link =~ s/([^&])\#.*$/$1/s;224 $raw_link =~ s/([^&])\#.*$/$1/s; 224 225 225 226 # some links may just be anchor names 226 next unless ($ link =~ /\S+/);227 228 if ($ link !~ m@^/@ && $link !~ m/^([A-Z]:?)\\/) {227 next unless ($raw_link =~ /\S+/); 228 229 if ($raw_link !~ m@^/@ && $raw_link !~ m/^([A-Z]:?)\\/) { 229 230 # Turn relative file path into full path 230 231 my $dirname = &File::Basename::dirname($filename_full_path); 231 $ link = &util::filename_cat($dirname, $link);232 } 233 $ link = $self->eval_dir_dots($link);232 $raw_link = &util::filename_cat($dirname, $raw_link); 233 } 234 $raw_link = $self->eval_dir_dots($raw_link); 234 235 235 236 # this is the actual filename on the filesystem (that the link refers to) 236 my $url_original_filename = $self->opt_url_decode($link); 237 238 # Convert the url_original_filename into its utf8 version. Store the utf8 link along with the url_original_filename 239 my $utf8_link = ""; 240 $self->decode_text($link,$content_encoding,$language,\$utf8_link); 241 242 $self->{'utf8_to_original_filename'}->{$utf8_link} = $url_original_filename; 243 # print STDERR "**** utf8_encoded_link to original src filename:\n\t$utf8_link\n\t".$self->{'utf8_to_original_filename'}->{$utf8_link}."\n"; 244 245 if ($url_original_filename ne $utf8_link) { 237 my $url_original_filename = $self->opt_url_decode($raw_link); 238 239 my ($uses_bytecodes,$exceeds_bytecodes) = &unicode::analyze_raw_string($url_original_filename); 240 241 if ($exceeds_bytecodes) { 242 # We have a link to a file name that is more complicated than a raw byte filename 243 # What we do next depends on the operating system we are on 244 245 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) { 246 # Assume we're dealing with a UTF-8 encoded filename 247 $url_original_filename = encode("utf8", $url_original_filename); 248 } 249 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) { 250 # HFS+ is UTF8 with decompostion 251 $url_original_filename = encode("utf8", $url_original_filename); 252 $url_original_filename = normalize('D', $url_original_filename); # Normalization Form D (decomposition) 253 } 254 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) { 255 # Don't need to do anything as later code maps Windows 256 # unicode filenames to DOS short filenames when needed 257 } 258 else { 259 my $outhandle = $self->{'outhandle'}; 260 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n"; 261 print $outhandle " in raw file system encoding of: $raw_link\n"; 262 print $outhandle " Assuming filesystem is UTF-8 based.\n"; 263 $url_original_filename = encode("utf8", $url_original_filename); 264 } 265 } 266 267 # Convert the (currently raw) link into its Unicode version. 268 # Store the Unicode link along with the url_original_filename 269 my $unicode_url_original_filename = ""; 270 $self->decode_text($raw_link,$content_encoding,$language,\$unicode_url_original_filename); 271 272 273 $self->{'unicode_to_original_filename'}->{$unicode_url_original_filename} = $url_original_filename; 274 275 276 if ($url_original_filename ne $unicode_url_original_filename) { 246 277 my $outhandle = $self->{'outhandle'}; 247 278 248 279 print $outhandle "URL Encoding $url_original_filename\n"; 249 print $outhandle " ->$utf8_link\n"; 250 } 251 252 253 my $unicode_url_original_filename = decode("utf8",$url_original_filename); 254 255 ## print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n"; 256 257 # Allow for possibility of raw byte version (UTF8) and Unicode versions of file 280 print $outhandle " ->$unicode_url_original_filename\n"; 281 282 # Allow for possibility of raw byte version and Unicode versions of file 283 $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1; 284 285 } 286 258 287 $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 259 $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1;260 288 } 261 289 } … … 266 294 # string already exists in the local folder. 267 295 # 268 # Is the following still true??269 # Return the original filename corresponding to the parameter URL-encoded filename, and270 # a decoded flag that is set to true iff URL-decoding had to be applied.271 296 sub opt_url_decode { 272 297 my $self = shift (@_); 273 my ($link) = @_; 298 my ($raw_link) = @_; 299 274 300 275 301 # Replace %XX's in URL with decoded value if required. 276 302 # Note that the filename may include the %XX in some situations 277 if ($link =~ m/\%[A-F0-9]{2}/i) { 278 if (!-e $link) { 279 $link = &unicode::url_decode($link); 303 304 ## if ($raw_link =~ m/\%[A-F0-9]{2}/i) { 305 306 if (($raw_link =~ m/\%[A-F0-9]{2}/i) || ($raw_link =~ m/\&\#x[0-9A-F]+;/i) || ($raw_link =~ m/\&\#[0-9]+;/i)) { 307 if (!-e $raw_link) { 308 $raw_link = &unicode::url_decode($raw_link,1); 280 309 } 281 310 } 282 283 return $ link;311 312 return $raw_link; 284 313 } 285 314 … … 367 396 368 397 # Need to make sure there is a '/' on the end of upgraded_base_dir 369 if ( $upgraded_base_dir !~ m/\/$/) {398 if (($upgraded_base_dir ne "") && ($upgraded_base_dir !~ m/\/$/)) { 370 399 $upgraded_base_dir .= "/"; 371 400 } … … 418 447 # my $utf8_file = $self->filename_to_utf8_metadata($file); 419 448 # $utf8_file =~ s/&\#095;/_/g; 420 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 421 449 # variable below used to be utf8_file 450 my $url_encoded_file = &unicode::raw_filename_to_url_encoded($tailname); 451 my $utf8_url_encoded_file = &unicode::raw_filename_to_utf8_url_encoded($tailname); 422 452 423 453 my $web_url = "http://"; 454 my $utf8_web_url = "http://"; 424 455 if(defined $dirname) { # local directory 425 456 # Check for "ftp" in the domain name of the directory … … 432 463 { 433 464 $web_url = "ftp://"; 465 $utf8_web_url = "ftp://"; 434 466 } 435 467 $dirname = $self->eval_dir_dots($dirname); 436 468 $dirname .= &util::get_dirsep() if $dirname ne ""; # if there's a directory, it should end on "/" 437 $web_url = $web_url.$dirname.$utf8_file; 469 470 $web_url = $web_url.$dirname.$url_encoded_file; 471 $utf8_web_url = $utf8_web_url.$dirname.$utf8_url_encoded_file; 438 472 } else { 439 $web_url = $web_url.$utf8_file; 473 $web_url = $web_url.$url_encoded_file; 474 $utf8_web_url = $utf8_web_url.$utf8_url_encoded_file; 440 475 } 441 476 $web_url =~ s/\\/\//g; 477 $utf8_web_url =~ s/\\/\//g; 442 478 443 479 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 444 print STDERR "******* upgraded_file: $upgraded_file\n";445 print STDERR "******* adding URL metadata: $utf8_file\n";480 print STDERR "*******DEBUG: upgraded_file: $upgraded_file\n"; 481 print STDERR "*******DEBUG: adding URL metadata: $utf8_url_encoded_file\n"; 446 482 } 447 483 448 484 449 485 $doc_obj->add_utf8_metadata($cursection, "URL", $web_url); 486 $doc_obj->add_utf8_metadata($cursection, "UTF8URL", $utf8_web_url); 450 487 451 488 if ($self->{'file_is_url'}) { … … 590 627 } 591 628 629 592 630 # single section document 593 631 $self->process_section($textref, $upgraded_base_dir, $upgraded_file, $doc_obj, $cursection); … … 664 702 ## $$textref =~ s/(<img[^>]*?usemap\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 665 703 704 my $opencom = '(?:<!--|<!(?:—|—|--))'; 705 my $closecom = '(?:-->|(?:—|—|--)>)'; 706 666 707 $$textref =~ s/(<img[^>]*?usemap\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/ 667 708 $self->replace_usemap_links($1, $2, $3)/isge; 668 709 669 710 ## $$textref =~ s/(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*[\"\']?)([^\"\'>\s]+)([\"\']?[^>]*>)/ 670 671 $$textref =~ s/( <(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)/672 $self->replace_href_links ($1, $2, $3, $ base_dir, $file, $doc_obj, $cursection)/isge;711 712 $$textref =~ s/($opencom.*?)?+(<(?:a|area|frame|link|script)\s+[^>]*?\s*(?:href|src)\s*=\s*)((?:[\"][^\"]+[\"])|(?:[\'][^\']+[\'])|(?:[^\s\/>]+))([^>]*>)(.*?$closecom)?+/ 713 $self->replace_href_links ($1, $2, $3, $4, $5, $base_dir, $file, $doc_obj, $cursection)/isge; 673 714 } 674 715 … … 753 794 sub replace_href_links { 754 795 my $self = shift (@_); 755 my ($front, $link, $back, $base_dir, $file, $doc_obj, $section) = @_; 796 my ($opt_open_comm, $front, $link, $back, $opt_close_comm, $base_dir, $file, $doc_obj, $section) = @_; 797 798 if ((defined $opt_open_comm) && (defined $opt_close_comm)) { 799 # href link was embedded in <!-- comments --> 800 ### print STDERR "****** Link in comment, skipping $link\n"; 801 return $opt_open_comm . $front . $link . $back . $opt_close_comm; 802 } 756 803 757 804 # remove quotes from link at start and end if necessary … … 819 866 } 820 867 821 $href = &unicode::raw_filename_to_u rl_encoded($href);868 $href = &unicode::raw_filename_to_utf8_url_encoded($href); 822 869 $href = &unicode::filename_to_url($href); 823 870 … … 825 872 826 873 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 827 print STDERR "****** href=$href\n";874 print STDERR "******DEBUG: href=$href\n"; 828 875 } 829 876 … … 844 891 my $filename = $href; 845 892 if ($base_dir eq "") { 846 # remove http:/ thereby leaving one slash at the start 847 $filename =~ s/^[^:]*:\///; 893 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 894 # remove http:// 895 $filename =~ s/^[^:]*:\/\///; 896 } 897 else { 898 # remove http:/ thereby leaving one slash at the start as 899 # part of full pathname 900 $filename =~ s/^[^:]*:\///; 901 } 848 902 } 849 903 else { … … 863 917 # file's name was in URL encoding, the following method will not decode 864 918 # it. 865 my $utf8_filename = $filename; 866 my $opt_decode_utf8_filename = $self->opt_url_decode($utf8_filename); 919 my $unicode_filename = $filename; 920 my $opt_decode_unicode_filename = $self->opt_url_decode($unicode_filename); 921 922 # wvWare can generate <img src="StrangeNoGraphicData"> tags, but with no 923 # (it seems) accompanying file 924 if ($opt_decode_unicode_filename =~ m/StrangeNoGraphicData$/) { return ""; } 867 925 868 926 my $content_encoding= $self->{'content_encoding'} || "utf8"; 869 927 870 # The filenames that come through the HTML file have been decoded 871 # into Unicode aware Perl strings. Need to convert them back 872 # to their initial raw-byte encoding to match the file that 873 # exists on the file system 874 $filename = encode($content_encoding, $opt_decode_utf8_filename); 928 if ($ENV{'GSDLOS'} =~ /^(linux|solaris)$/i) { 929 # The filenames that come through the HTML file have been decoded 930 # into Unicode aware Perl strings. Need to convert them back 931 # to their initial raw-byte encoding to match the file that 932 # exists on the file system 933 $filename = encode($content_encoding, $opt_decode_unicode_filename); 934 } 935 elsif ($ENV{'GSDLOS'} =~ /^darwin$/i) { 936 # HFS+ is UTF8 with decompostion 937 $filename = encode($content_encoding, $opt_decode_unicode_filename); 938 $filename = normalize('D', $filename); # Normalization Form D (decomposition) 939 940 } 941 elsif ($ENV{'GSDLOS'} =~ /^windows$/i) { 942 my $long_filename = Win32::GetLongPathName($opt_decode_unicode_filename); 943 944 if (defined $long_filename) { 945 my $short_filename = Win32::GetLongPathName($long_filename); 946 $filename = $short_filename; 947 } 948 # else { 949 # print STDERR "***** failed to map href to real file:\n"; 950 # print STDERR "****** $href -> $opt_decode_unicode_filename\n"; 951 # } 952 } 953 else { 954 my $outhandle = $self->{'outhandle'}; 955 print $outhandle "Warning: Unrecognized operating system ", $ENV{'GSDLOS'}, "\n"; 956 print $outhandle " in file system encoding of href: $href\n"; 957 print $outhandle " No character encoding done.\n"; 958 } 959 875 960 876 961 # some special processing if the intended filename was converted to utf8, but … … 882 967 } 883 968 884 ## print STDERR "**** trying to look up u tf8_filename: $utf8_filename\n";885 886 my $original_filename = $self->{'u tf8_to_original_filename'}->{$utf8_filename};969 ## print STDERR "**** trying to look up unicode_filename: $unicode_filename\n"; 970 971 my $original_filename = $self->{'unicode_to_original_filename'}->{$unicode_filename}; 887 972 888 973 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 889 print STDERR "****** From lookup u tf8_filename, now trying for: $original_filename\n";974 print STDERR "****** From lookup unicode_filename, now trying for: $original_filename\n"; 890 975 } 891 976 … … 928 1013 return "_httpdocimg_/$newname"; 929 1014 } else { 930 if(&unicode::is_url_encoded($u tf8_filename)) {1015 if(&unicode::is_url_encoded($unicode_filename)) { 931 1016 # use the possibly-decoded filename instead to avoid double URL encoding 932 1017 ($newname) = $filename =~ m/([^\/\\]*)$/; 933 1018 } else { 934 ($newname) = $u tf8_filename =~ m/([^\/\\]*)$/;1019 ($newname) = $unicode_filename =~ m/([^\/\\]*)$/; 935 1020 } 936 1021 … … 1034 1119 1035 1120 my $linkfilename = &util::filename_cat ($base_dir, $before_hash); 1121 1122 1123 # print STDERR "**** linkfilename = $linkfilename\n"; 1124 # if (!&util::fd_exists($linkfilename)) { 1125 # print STDERR "***** Warning: Could not find $linkfilename\n"; 1126 # } 1127 1128 1036 1129 # make sure there's a slash on the end if it's a directory 1037 1130 if ($before_hash !~ m/\/$/) { 1038 1131 $before_hash .= "/" if (-d $linkfilename); 1039 1132 } 1133 1134 # print STDERR "*** returning: $before_hash\n"; 1135 1040 1136 return ("http://" . $before_hash, $hash_part, 1); 1041 1137 } else { -
main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm
r23363 r23387 307 307 $reader->set_encoding($encoding); 308 308 $reader->decode_text($raw_text,$textref); 309 310 # At this point $$textref is a binary byte string 311 # => turn it into a Unicode aware string, so full 312 # Unicode aware pattern matching can be used. 313 # For instance: 's/\x{0101}//g' or '[[:upper:]]' 314 315 $$textref = decode("utf8",$$textref); 309 316 } 310 317 } -
main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm
r23363 r23387 645 645 $utf8_real_filename =~ s/^\\(.*)/$1/i; 646 646 647 my $real_filename = &util::utf8_to_real_filename($utf8_real_filename); 647 ## my $real_filename = &util::utf8_to_real_filename($utf8_real_filename); 648 my $real_filename = $utf8_real_filename; 649 $real_filename = &util::downgrade_if_dos_filename($real_filename); 648 650 649 651 if (-e $real_filename) { … … 692 694 $real_filename =~ s/^\\(.*)/$1/i; 693 695 694 my $raw_filename; 695 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 696 # Need to generate DOS version of filename to test in '-e $raw_filename' below 697 require Win32; 698 my $unicode_filename = Encode::decode("utf8",$real_filename); 699 ### print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n"; 700 701 $raw_filename = Win32::GetShortPathName($unicode_filename); 702 } 703 else { 704 $raw_filename = $real_filename; 705 } 696 my $raw_filename = &util::downgrade_if_dos_filename($real_filename); 706 697 707 698 if (-e $raw_filename) { -
main/trunk/greenstone2/perllib/unicode.pm
r23371 r23387 627 627 628 628 sub url_decode { 629 my ($text ) = @_;629 my ($text,$and_numeric_entities) = @_; 630 630 631 631 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige; 632 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige; 633 $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige; 632 633 if ((defined $and_numeric_entities) && ($and_numeric_entities)) { 634 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige; 635 $text =~ s/\&\#([0-9]+);/pack('C', $1)/ige; 636 } 634 637 635 638 return $text; … … 773 776 } 774 777 775 776 778 sub url_encoded_to_raw_filename 777 779 { … … 787 789 } 788 790 791 792 sub raw_filename_to_utf8_url_encoded 793 { 794 my ($str_in) = @_; 795 796 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in); 797 798 my @url_encoded_chars 799 = map { $_ > 128 ? # Representable in %XX form 800 sprintf("%%%2X", $_) : 801 chr($_) # otherwise, Ascii char 802 } unpack("U*", $str_in); # Unpack utf8 characters 803 804 805 my $str_out = join("", @url_encoded_chars); 806 807 return $str_out; 808 809 } 810 811 sub utf8_url_encoded_to_raw_filename 812 { 813 my ($str_in) = @_; 814 815 my $utf8_str_out = $str_in; 816 817 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig; 818 819 my $unicode_str_out = decode("utf8",$utf8_str_out); 820 my $raw_str_out = utf8::downgrade($unicode_str_out); 821 822 return $raw_str_out; 823 } 824 825 sub analyze_raw_string 826 { 827 my ($str_in) = @_; 828 829 my $uses_bytecodes = 0; 830 my $exceeds_bytecodes = 0; 831 832 map { $exceeds_bytecodes = 1 if ($_ >= 256); 833 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256)); 834 } unpack("U*", $str_in); # Unpack Unicode characters 835 836 return ($uses_bytecodes,$exceeds_bytecodes); 837 } 838 839 789 840 1;
Note:
See TracChangeset
for help on using the changeset viewer.