Changeset 23352
- Timestamp:
- 2010-11-28T23:24:22+13:00 (13 years ago)
- Location:
- main/trunk/greenstone2/perllib/plugins
- Files:
-
- 15 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r23347 r23352 565 565 566 566 elsif ($filename_encoding eq "auto-filesystem-encoding") 567 { 567 { 568 568 # try locale 569 569 $filename_encoding = $self->locale_encoding(); … … 740 740 741 741 # uses locale 742 sub get_filesystem_encoding { 742 sub get_filesystem_encoding 743 { 743 744 744 745 my $self = shift(@_); … … 748 749 749 750 eval { 751 # Works for Windows as well, returning the DOS code page in use 750 752 use POSIX qw(locale_h); 751 753 … … 788 790 789 791 } 792 790 793 return $filesystem_encoding; 791 794 } … … 840 843 841 844 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 842 # See if we can determine the file system encoding through locale 843 $deduced_filename_encoding = $self->locale_encoding(); 844 845 # if locale shows us filesystem is utf8, check to see filename is consistent 846 # => if not, then we have an "alien" filename on our hands 847 848 if ($deduced_filename_encoding =~ m/^utf-?8$/i) { 849 if (!&unicode::check_is_utf8($file)) { 850 # "alien" filename, so revert 851 $deduced_filename_encoding = undef; 852 } 845 846 # Look to file system to provide a character encoding 847 848 # If Windows NTFS, then -- assuming we work with long file names got through 849 # Win32::GetLongFilePath() -- then the underlying file system is UTF16 850 851 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 852 # Can do better than working with the DOS character encoding returned by locale 853 $deduced_filename_encoding = "unicode"; 854 } 855 else { 856 # Unix of some form or other 857 858 # See if we can determine the file system encoding through locale 859 $deduced_filename_encoding = $self->locale_encoding(); 860 861 # if locale shows us filesystem is utf8, check to see filename is consistent 862 # => if not, then we have an "alien" filename on our hands 863 864 if ($deduced_filename_encoding =~ m/^utf-?8$/i) { 865 if (!&unicode::check_is_utf8($file)) { 866 # "alien" filename, so revert 867 $deduced_filename_encoding = undef; 853 868 } 854 } 855 869 } 870 } 871 } 856 872 857 873 # if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { … … 884 900 sub set_Source_metadata { 885 901 my $self = shift (@_); 886 my ($doc_obj, $raw_file , $filename_encoding) = @_;902 my ($doc_obj, $raw_filename, $filename_encoding) = @_; 887 903 888 904 # 1. Sets the filename (Source) for display encoded as Unicode if possible, … … 890 906 # 2. Sets the url ref (SourceFile) to the URL encoded version 891 907 # of filename for generated files 908 909 my ($unused_full_rf, $raw_file) = &util::get_full_filenames("", $raw_filename); 892 910 893 911 my $top_section = $doc_obj->get_top_section(); 894 912 913 my $octet_file = $raw_file; 914 895 915 # UTF-8 version of filename 896 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 897 print STDERR "****** Setting Source Metadata given: $raw_file\n"; 898 } 916 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 917 print STDERR "****** Setting Source Metadata given: $octet_file\n"; 918 } 919 920 # Deal with (on Windows) raw filenames that are in their 921 # abbreviated DOS form 922 923 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 924 if ((defined $filename_encoding) && ($filename_encoding eq "unicode")) { 925 if (-e $raw_filename) { 926 require Win32; 927 928 ## print STDERR "**** raw filename before LPN: $raw_filename\n"; 929 my $unicode_filename = Win32::GetLongPathName($raw_filename); 930 931 my $unused_full_uf; 932 ($unused_full_uf, $octet_file) = &util::get_full_filenames("", $unicode_filename); 933 934 ## print STDERR "**** raw filename after LPN: $raw_filename\n"; 935 } 936 } 937 } 899 938 900 939 my $url_encoded_filename; 901 940 if (defined $filename_encoding) { 902 903 904 905 $url_encoded_filename = decode($filename_encoding,$raw_file);941 # => Generate a pretty print version of filename that is mapped to Unicode 942 943 # Use filename_encoding to map raw filename to a Perl unicode-aware string 944 $url_encoded_filename = decode($filename_encoding,$octet_file); 906 945 } 907 946 else { 908 909 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($raw_file);910 } 911 912 913 914 915 947 # otherwise generate %xx encoded version of filename for char > 127 948 $url_encoded_filename = &unicode::raw_filename_to_url_encoded($octet_file); 949 } 950 951 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 952 print STDERR "***** saving Source as: $url_encoded_filename\n"; 953 } 954 916 955 917 956 # Source is the UTF8 display name - not necessarily the name of the file on the system 918 957 $doc_obj->set_utf8_metadata_element($top_section, "Source", $url_encoded_filename); 919 958 920 959 my $renamed_raw_file = &util::rename_file($raw_file, $self->{'file_rename_method'}); 921 960 # If using URL encoding, then SourceFile is the url-reference to url-encoded … … 926 965 $renamed_raw_url); 927 966 928 929 930 967 if ((defined $ENV{"DEBUG_UNICODE"}) && ($ENV{"DEBUG_UNICODE"})) { 968 print STDERR "***** saving SourceFile as: $renamed_raw_url\n"; 969 } 931 970 } 932 971 … … 988 1027 989 1028 990 1029 my $plugin_filename_encoding = $self->{'filename_encoding'}; 991 1030 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 992 $self->set_Source_metadata($doc_obj,$filename_ no_path,$filename_encoding);1031 $self->set_Source_metadata($doc_obj,$filename_full_path,$filename_encoding,$filename_full_path); 993 1032 994 1033 # plugin specific stuff - what args do we need here?? -
main/trunk/greenstone2/perllib/plugins/CONTENTdmPlugin.pm
r23349 r23352 661 661 my $plugin_filename_encoding = $self->{'filename_encoding'}; 662 662 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 663 $self->set_Source_metadata($doc_obj, $file meta, $filename_encoding);663 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 664 664 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 665 665 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "FileSize", (-s $filename_full_path)); -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r23349 r23352 399 399 my $plugin_filename_encoding = $self->{'filename_encoding'}; 400 400 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 401 $self->set_Source_metadata($doc_obj, $filename_ no_path, $filename_encoding);401 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 402 402 403 403 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); -
main/trunk/greenstone2/perllib/plugins/ConvertToRogPlugin.pm
r23349 r23352 347 347 my $plugin_filename_encoding = $self->{'filename_encoding'}; 348 348 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 349 $self->set_Source_metadata($doc_obj, $ filemeta, $filename_encoding);349 $self->set_Source_metadata($doc_obj, $conv_filename, $filename_encoding); 350 350 351 351 if ($self->{'cover_image'}) { -
main/trunk/greenstone2/perllib/plugins/DatabasePlugin.pm
r23349 r23352 272 272 273 273 my $plugin_filename_encoding = $self->{'filename_encoding'}; 274 275 $self->set_Source_metadata($doc_obj, $ db, $filename_encoding);274 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 275 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 276 276 277 277 if ($self->{'cover_image'}) { -
main/trunk/greenstone2/perllib/plugins/HTMLPlugin.pm
r23349 r23352 318 318 319 319 my $plugin_filename_encoding = $self->{'filename_encoding'}; 320 321 $self->set_Source_metadata($doc_obj, $filename_ no_path, $filename_encoding);320 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 321 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 322 322 } 323 323 -
main/trunk/greenstone2/perllib/plugins/ImageConverter.pm
r23349 r23352 152 152 sub generate_images { 153 153 my $self = shift(@_); 154 my ($filename_full_path, $filename_no_path, $doc_obj, $section, $filename_encoding) = @_; 154 my ($filename_full_path, $filename_encoded_full_path, $doc_obj, $section, $filename_encoding) = @_; 155 156 my ($unused_fefp,$filename_encoded_no_path) 157 = util::get_full_filenames("",$filename_encoded_full_path); 158 159 # The following is potentially very muddled thinking (but currently seems to work) 160 # generate_images currently called from ImagePlugin and PagedImagePlugin 161 my $filename_no_path = $filename_encoded_no_path; 155 162 156 163 # check image magick status … … 219 226 # $self->set_Source_metadata($doc_obj,$url_to_filename_no_path,undef); 220 227 221 $self->set_Source_metadata($doc_obj,&unicode::url_decode($filename_no_path), 222 $filename_encoding); 228 my $raw_filename_full_path = &unicode::url_decode($filename_encoded_full_path); 229 $self->set_Source_metadata($doc_obj,$raw_filename_full_path, 230 $filename_encoding); 223 231 224 232 -
main/trunk/greenstone2/perllib/plugins/MARCXMLPlugin.pm
r23349 r23352 227 227 my $plugin_filename_encoding = $self->{'filename_encoding'}; 228 228 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 229 $self->set_Source_metadata($doc_obj, $file meta, $filename_encoding);229 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding); 230 230 231 231 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$self->{'record_count'}"); -
main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm
r23349 r23352 297 297 298 298 my ($filemeta) = $file =~ /([^\\\/]+)$/; 299 299 my $plugin_filename_encoding = $self->{'filename_encoding'}; 300 300 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 301 $self->set_Source_metadata($doc_obj, $file meta, $filename_encoding);301 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding); 302 302 303 303 $doc_obj->add_utf8_metadata($top_section, "Language", $language); -
main/trunk/greenstone2/perllib/plugins/OpenDocumentPlugin.pm
r23349 r23352 268 268 my $filename_encoding = $self->deduce_filename_encoding($file_only,$metadata,$plugin_filename_encoding); 269 269 270 $self->set_Source_metadata($doc_obj, $file _only, $filename_encoding);271 270 $self->set_Source_metadata($doc_obj, $filename, $filename_encoding); 271 $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename)); 272 272 273 273 # include any metadata passed in from previous plugins -
main/trunk/greenstone2/perllib/plugins/PagedImagePlugin.pm
r23349 r23352 426 426 my $result = 0; 427 427 if ($self->{'image_conversion_available'} == 1) { 428 # do we need to convert $filename_no_path to utf8? We are already reading in from a file, what encoding is it in??? 429 $result = $self->generate_images($filename_full_path, $filename_no_path, $doc_obj, $section); 428 # do we need to convert $filename_no_path to utf8/url encoded? 429 # We are already reading in from a file, what encoding is it in??? 430 my $url_encoded_full_filename 431 = &unicode::raw_filename_to_url_encoded($filename_full_path); 432 $result = $self->generate_images($filename_full_path, $url_encoded_full_filename, $doc_obj, $section); 430 433 } 431 434 #overwrite one set in ImageConverter … … 513 516 $self->{'doc_obj'} = new doc ($self->{'filename'}, "indexed_doc", $self->{'file_rename_method'}); 514 517 # TODO is file filenmae_no_path?? 515 $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'file '}, $self->{'processor'}, $self->{'metadata'});518 $self->set_initial_doc_fields($self->{'doc_obj'}, $self->{'filename'}, $self->{'processor'}, $self->{'metadata'}); 516 519 517 520 my ($dir, $file) = $self->{'filename'} =~ /^(.*?)([^\/\\]*)$/; … … 540 543 sub set_initial_doc_fields { 541 544 my $self = shift(@_); 542 my ($doc_obj, $filename_ no_path, $processor, $metadata) = @_;545 my ($doc_obj, $filename_full_path, $processor, $metadata) = @_; 543 546 544 547 my $topsection = $doc_obj->get_top_section(); … … 552 555 } 553 556 554 555 my $filename_encoding = $self->deduce_filename_encoding($filename_ no_path,$metadata,$plugin_filename_encoding);556 $self->set_Source_metadata($doc_obj, $filename_ no_path, $filename_encoding);557 my $plugin_filename_encoding = $self->{'filename_encoding'}; 558 my $filename_encoding = $self->deduce_filename_encoding($filename_full_path,$metadata,$plugin_filename_encoding); 559 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 557 560 558 561 # if we want a header page, we need to add some text into the top section, otherwise this section will become invisible … … 620 623 621 624 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 622 $self->set_initial_doc_fields($doc_obj, $filename_ no_path, $processor, $metadata);625 $self->set_initial_doc_fields($doc_obj, $filename_full_path, $processor, $metadata); 623 626 my $topsection = $doc_obj->get_top_section(); 624 627 open (ITEMFILE, $filename_full_path) || die "couldn't open $filename_full_path\n"; -
main/trunk/greenstone2/perllib/plugins/PowerPointPlugin.pm
r23349 r23352 346 346 my $plugin_filename_encoding = $self->{'filename_encoding'}; 347 347 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 348 $self->set_Source_metadata($doc_obj, $filename_ no_path,$filename_encoding);348 $self->set_Source_metadata($doc_obj, $filename_full_path,$filename_encoding); 349 349 350 350 $doc_obj->set_utf8_metadata_element($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); -
main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm
r23348 r23352 141 141 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 142 142 143 143 my $plugin_filename_encoding = $self->{'filename_encoding'}; 144 144 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 145 $self->set_Source_metadata($doc_obj, $filename_ no_path, $filename_encoding);145 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 146 146 147 147 $doc_obj->add_utf8_metadata($top_section, "Language", $language); -
main/trunk/greenstone2/perllib/plugins/ReadXMLFile.pm
r23349 r23352 369 369 my $self = shift(@_); 370 370 371 my $metadata = $self->{'metadata'}; 371 my $metadata = $self->{'metadata'}; 372 my $filename_full_path = $self->{'filename'}; 372 373 373 374 # create a new document 374 my $doc_obj = $self->{'doc_obj'} = new doc ($ self->{'filename'}, "indexed_doc", $self->{'file_rename_method'});375 my $doc_obj = $self->{'doc_obj'} = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); 375 376 376 377 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}"); 377 378 378 379 379 my $filename_no_path = $self->{'filename_no_path'}; 380 my $plugin_filename_encoding = $self->{'filename_encoding'}; 380 381 my $filename_encoding = $self->deduce_filename_encoding($filename_no_path,$metadata,$plugin_filename_encoding); 381 382 382 $self->set_Source_metadata($doc_obj, $filename_ no_path, $filename_encoding);383 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 383 384 384 385 # do we want other auto metadata here (see BasePlugin.read_into_doc_obj) -
main/trunk/greenstone2/perllib/plugins/SplitTextFile.pm
r23349 r23352 243 243 my $plugin_filename_encoding = $self->{'filename_encoding'}; 244 244 my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding); 245 $self->set_Source_metadata($doc_obj, $file meta, $filename_encoding);245 $self->set_Source_metadata($doc_obj, $filename_full_path, $filename_encoding); 246 246 247 247 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "SourceSegment", "$segment");
Note:
See TracChangeset
for help on using the changeset viewer.