Changeset 23363
- Timestamp:
- 2010-12-01T11:42:27+13:00 (13 years ago)
- Location:
- main/trunk/greenstone2/perllib
- Files:
-
- 8 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r23352 r23363 414 414 my ($filename_full_path, $block_hash) = @_; 415 415 416 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path); 417 416 418 if ($self->{'block_exp'} ne "" && $filename_full_path =~ /$self->{'block_exp'}/) { 417 419 $block_hash->{'file_blocks'}->{$filename_full_path} = 1; … … 426 428 my ($filename, $block_hash) = @_; 427 429 430 $filename = &util::upgrade_if_dos_filename($filename); 431 428 432 if ($self->{'cover_image'}) { 429 433 my $coverfile = $filename; 430 434 $coverfile =~ s/\.[^\\\/\.]+$/\.jpg/; 431 if (! -e $coverfile) {435 if (!&util::fd_exists($coverfile)) { 432 436 $coverfile =~ s/jpg$/JPG/; 433 437 } 434 if ( -e $coverfile) {438 if (&util::fd_exists($coverfile)) { 435 439 $block_hash->{'file_blocks'}->{$coverfile} = 1; 436 440 } … … 915 919 # UTF-8 version of filename 916 920 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 917 print STDERR "**** **Setting Source Metadata given: $octet_file\n";921 print STDERR "**** Setting Source Metadata given: $octet_file\n"; 918 922 } 919 923 … … 950 954 951 955 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 952 print STDERR "***** saving Source as: $url_encoded_filename\n";956 print STDERR "****** saving Source as: $url_encoded_filename\n"; 953 957 } 954 958 … … 966 970 967 971 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 968 print STDERR "***** saving SourceFile as: $renamed_raw_url\n";972 print STDERR "****** saving SourceFile as: $renamed_raw_url\n"; 969 973 } 970 974 } … … 1014 1018 # should we move this to read? What about secondary plugins? 1015 1019 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 1016 print $outhandle "$self->{'plugin_type'} processing $file\n" 1020 my $pp_file = &util::prettyprint_file($base_dir,$file); 1021 print $outhandle "$self->{'plugin_type'} processing $pp_file\n" 1017 1022 if $self->{'verbosity'} > 1; 1018 1023 … … 1296 1301 my $zip_filename = $metadata->{$field}; 1297 1302 # overwrite the source_path 1298 $doc_obj-> {'source_path'} = $zip_filename;1303 $doc_obj->set_source_path($zip_filename); 1299 1304 # and set the metadata 1300 1305 $zip_filename = &util::filename_within_collection($zip_filename); … … 1337 1342 my ($doc_obj, $filename) = @_; 1338 1343 1344 my $upgraded_filename = &util::upgrade_if_dos_filename($filename); 1345 1339 1346 $filename =~ s/\.[^\\\/\.]+$/\.jpg/; 1340 if (exists $self->{'covers_missing_cache'}->{$filename}) { 1341 # don't stat() for existence eg for multiple document input files 1347 $upgraded_filename =~ s/\.[^\\\/\.]+$/\.jpg/; 1348 1349 if (exists $self->{'covers_missing_cache'}->{$upgraded_filename}) { 1350 # don't stat() for existence e.g. for multiple document input files 1342 1351 # (eg SplitPlug) 1343 1352 return; … … 1346 1355 my $top_section=$doc_obj->get_top_section(); 1347 1356 1348 if ( -e $filename) {1357 if (&util::fd_exists($upgraded_filename)) { 1349 1358 $doc_obj->associate_source_file($filename); 1350 1359 $doc_obj->associate_file($filename, "cover.jpg", "image/jpeg"); … … 1352 1361 } else { 1353 1362 my $upper_filename = $filename; 1363 my $upgraded_upper_filename = $upgraded_filename; 1364 1354 1365 $upper_filename =~ s/jpg$/JPG/; 1355 if (-e $upper_filename) { 1366 $upgraded_upper_filename =~ s/jpg$/JPG/; 1367 1368 if (&util::fd_exists($upgraded_upper_filename)) { 1356 1369 $doc_obj->associate_source_file($upper_filename); 1357 1370 $doc_obj->associate_file($upper_filename, "cover.jpg", … … 1361 1374 # file doesn't exist, so record the fact that it's missing so 1362 1375 # we don't stat() again (stat is slow) 1363 $self->{'covers_missing_cache'}->{$ filename} = 1;1376 $self->{'covers_missing_cache'}->{$upgraded_filename} = 1; 1364 1377 } 1365 1378 } -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r23352 r23363 394 394 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 395 395 # build. so set it manually. 396 $doc_obj-> {'source_path'} = $filename_full_path;396 $doc_obj->set_source_path($filename_full_path); 397 397 $doc_obj->set_converted_filename($collect_conv_file); 398 398 -
main/trunk/greenstone2/perllib/plugins/DirectoryPlugin.pm
r23335 r23363 257 257 my ($block_hash, $filename_full_path) = @_; 258 258 259 $filename_full_path = &util::upgrade_if_dos_filename($filename_full_path); 260 ### print STDERR "*** DirectoryPlugin::file_is_blocked $filename_full_path\n"; 261 259 262 if (defined $block_hash->{'file_blocks'}->{$filename_full_path}) { 260 263 $self->{'num_blocked'} ++; -
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r23352 r23363 186 186 # read in file ($text will be in utf8) 187 187 my $raw_text = ""; 188 $self->read_file_no_decoding 188 $self->read_file_no_decoding($filename_full_path, \$raw_text); 189 189 190 190 my $textref = \$raw_text; … … 192 192 my $closecom = '(?:-->|(?:—|—|--)>)'; 193 193 $$textref =~ s/$opencom(.*?)$closecom//gs; 194 195 # Convert entities to their UTF8 equivalents 196 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; 197 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,0)/gseo; # on this occassion, want it left as utf8 198 $$textref =~ s/&z(lt|gt|amp|quot|nbsp);/&$1;/go; 194 199 195 200 my $attval = "\\\"[^\\\"]+\\\"|[^\\s>]+"; … … 209 214 210 215 # remove quotes from link at start and end if necessary 211 if ($link =~/^\"/) {212 $link =~s/^\"//;213 $link =~s/\"$//;214 } 215 216 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 216 if ($link =~ m/^\"/) { 217 $link =~ s/^\"//; 218 $link =~ s/\"$//; 219 } 220 221 $link =~ s/\#.*$//s; # remove any anchor names, e.g. foo.html#name becomes foo.html 217 222 # some links may just be anchor names 218 223 next unless ($link =~ /\S+/); … … 242 247 } 243 248 249 250 my $unicode_url_original_filename = decode("utf8",$url_original_filename); 251 252 ## print STDERR "*****!!! Blocking url original filename = $unicode_url_original_filename\n"; 253 254 # Allow for possibility of raw byte version (UTF8) and Unicode versions of file 244 255 $block_hash->{'file_blocks'}->{$url_original_filename} = 1; 256 $block_hash->{'file_blocks'}->{$unicode_url_original_filename} = 1; 245 257 } 246 258 } … … 250 262 # filename*, it does not URL decode any filename if a file by the name of the *URL-encoded* 251 263 # string already exists in the local folder. 264 # 265 # Is the following still true?? 252 266 # Return the original filename corresponding to the parameter URL-encoded filename, and 253 267 # a decoded flag that is set to true iff URL-decoding had to be applied. … … 312 326 $doc_obj->set_source_filename ($collect_file, $self->{'file_rename_method'}); 313 327 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 314 # build. so set it manually.315 $doc_obj-> {'source_path'} = $filename_full_path;328 # build. So set it manually. 329 $doc_obj->set_source_path($filename_full_path); 316 330 my $collect_conv_file = &util::filename_within_collection($tidy_filename); 317 331 $doc_obj->set_converted_filename($collect_conv_file); … … 387 401 my $utf8_file = &unicode::raw_filename_to_url_encoded($tailname); 388 402 389 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) {390 print STDERR "***!! file = $file\n";391 print STDERR "***!! utf8_file = $utf8_file\n";392 }403 # if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 404 # print STDERR "***!! file = $file\n"; 405 # print STDERR "***!! utf8_file = $utf8_file\n"; 406 # } 393 407 394 408 … … 764 778 ($href =~ m/\/$/) || ($href =~ m/^(mailto|news|gopher|nntp|telnet|javascript):/i)) { 765 779 766 767 780 # If web page didn't give encoding, then default to utf8 781 my $content_encoding= $self->{'content_encoding'} || "utf8"; 782 768 783 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 769 print STDERR "*** Web page didn't give encoding, defaulting to UTF8!\n"; 770 print STDERR "***** looking up $file\n"; 771 } 772 773 my $content_encoding= $self->{'content_encoding'} || "utf8"; 784 print STDERR "*** Encoding with $content_encoding href: $href\n"; 785 } 786 774 787 $href = encode($content_encoding,$href); 775 788 … … 807 820 808 821 $filename = &util::filename_cat($base_dir, $filename); 822 809 823 if (($self->{'use_realistic_book'}) || ($self->{'old_style_HDL'})) { 810 824 # we are processing a tidytmp file - want paths to be in import … … 827 841 $filename = encode($content_encoding, $opt_decode_utf8_filename); 828 842 829 830 843 # some special processing if the intended filename was converted to utf8, but 831 844 # the actual file still needs to be renamed 832 if (! -e $filename) {845 if (!&util::fd_exists($filename)) { 833 846 # try the original filename stored in map 834 847 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 835 print STDERR "***###!! orig filename did not exist: $filename\n"; 836 } 848 print STDERR "******!! orig filename did not exist: $filename\n"; 849 } 850 851 ## print STDERR "**** trying to look up utf8_filename: $utf8_filename\n"; 837 852 838 853 my $original_filename = $self->{'utf8_to_original_filename'}->{$utf8_filename}; 839 854 840 855 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 841 print STDERR "**** Trying for$original_filename\n";856 print STDERR "****** From lookup utf8_filename, now trying for: $original_filename\n"; 842 857 } 843 858 844 859 if (defined $original_filename && -e $original_filename) { 845 860 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 846 print STDERR "*** found match\n";861 print STDERR "****** Found match!\n"; 847 862 } 848 863 $filename = $original_filename; … … 891 906 $newname = &util::rename_file($newname, $self->{'file_rename_method'}); 892 907 908 ### print STDERR "***** associating $filename (raw-byte/utf8)-> $newname\n"; 893 909 $doc_obj->associate_file($filename, $newname, undef, $section); 894 910 … … 1296 1312 $self->SUPER::read_file($filename, $encoding, $language, $textref); 1297 1313 1298 # Convert entities to their U TF8equivalents1314 # Convert entities to their Unicode code-point equivalents 1299 1315 $$textref =~ s/&(lt|gt|amp|quot|nbsp);/&z$1;/go; 1300 1316 $$textref =~ s/&([^;]+);/&ghtml::getcharequiv($1,1,1)/gseo; -
main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm
r23352 r23363 563 563 $self->add_dummy_text($doc_obj, $topsection); 564 564 } 565 566 567 565 } 568 566 -
main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm
r23352 r23363 341 341 ## set_source_filename does not set the doc_obj source_path which is used in archives dbs for incremental 342 342 # build. so set it manually. 343 $doc_obj-> {'source_path'} = $filename_full_path;343 $doc_obj->set_source_path($filename_full_path); 344 344 $doc_obj->set_converted_filename(&util::filename_cat($dirname_within_collection, $file)); 345 345 -
main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm
r23352 r23363 122 122 # should we move this to read? What about secondary plugins? 123 123 print STDERR "<Processing n='$file' p='$self->{'plugin_type'}'>\n" if ($gli); 124 print $outhandle "$self->{'plugin_type'} processing $file\n" 124 my $pp_file = &util::prettyprint_file($base_dir,$file); 125 print $outhandle "$self->{'plugin_type'} processing $pp_file\n" 125 126 if $self->{'verbosity'} > 1; 126 127 -
main/trunk/greenstone2/perllib/plugouts/BasePlugout.pm
r22818 r23363 640 640 $dir = "" unless defined $dir; 641 641 642 643 my $real_filename = $assoc_file_rec->[0]; 642 my $utf8_real_filename = $assoc_file_rec->[0]; 643 644 644 # for some reasons the image associate file has / before the full path 645 $real_filename =~ s/^\\(.*)/$1/i; 645 $utf8_real_filename =~ s/^\\(.*)/$1/i; 646 647 my $real_filename = &util::utf8_to_real_filename($utf8_real_filename); 648 646 649 if (-e $real_filename) { 647 650 … … 653 656 "gsdlassocfile", 654 657 "$afile:$assoc_file_rec->[2]:$dir"); 655 } elsif ($self->{'verbosity'} > 2) {658 } elsif ($self->{'verbosity'} > 1) { 656 659 print $outhandle "BasePlugout::process couldn't copy the associated file " . 657 660 "$real_filename to $afile\n"; … … 688 691 # for some reasons the image associate file has / before the full path 689 692 $real_filename =~ s/^\\(.*)/$1/i; 690 if (-e $real_filename) { 693 694 my $raw_filename; 695 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 696 # Need to generate DOS version of filename to test in '-e $raw_filename' below 697 require Win32; 698 my $unicode_filename = Encode::decode("utf8",$real_filename); 699 ### print STDERR "***### files to field: ", &unicode::debug_unicode_string($unicode_filename),"\n"; 700 701 $raw_filename = Win32::GetShortPathName($unicode_filename); 702 } 703 else { 704 $raw_filename = $real_filename; 705 } 706 707 if (-e $raw_filename) { 691 708 692 709 # if (defined $collect_dir) { … … 701 718 $reverse_lookups->{$real_filename} = 1; 702 719 } 703 push(@{$oid_files->{$field}},$full_file); 720 ### push(@{$oid_files->{$field}},$full_file); 721 push(@{$oid_files->{$field}},$raw_filename); 704 722 } 705 723 else {
Note:
See TracChangeset
for help on using the changeset viewer.