Changeset 23457 for main/trunk/greenstone2
- Timestamp: 2010-12-13T14:22:45+13:00 (13 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/BasePlugin.pm
r23419 r23457 63 63 { 'name' => "unicode", 64 64 'desc' => "{BasePlugin.encoding.unicode}" } ]; 65 65 66 66 67 67 my $e = $encodings::encodings; … … 78 78 [ { 'name' => "auto", 79 79 'desc' => "{BasePlugin.filename_encoding.auto}" }, 80 80 { 'name' => "auto-language-analysis", 81 81 'desc' => "{BasePlugin.filename_encoding.auto_language_analysis}" }, # textcat 82 82 { 'name' => "auto-filesystem-encoding", … … 166 166 'list' => $file_rename_method_list, 167 167 'reqd' => "no" 168 168 } 169 169 170 170 ]; … … 384 384 sub block_filename 385 385 { 386 my $self = shift(@_); 387 my ($block_hash,$filename) = @_; 386 my $self = shift(@_); 387 my ($block_hash,$filename) = @_; 388 389 if ($ENV{'GSDLOS'} =~ m/^windows$/) { 388 390 389 if ($ENV{'GSDLOS'} =~ m/^windows$/) { 390 391 my $lower_drive = $filename; 392 $lower_drive =~ s/^([A-Z]):/\l$1:/i; 393 394 my $upper_drive = $filename; 395 $upper_drive =~ s/^([A-Z]):/\u$1:/i; 396 397 $block_hash->{'file_blocks'}->{$lower_drive} = 1; 398 $block_hash->{'file_blocks'}->{$upper_drive} = 1; 399 } 400 else { 401 $block_hash->{'file_blocks'}->{$filename} = 1; 402 } 391 my $lower_drive = $filename; 392 $lower_drive =~ s/^([A-Z]):/\l$1:/i; 393 394 my $upper_drive = $filename; 395 $upper_drive =~ s/^([A-Z]):/\u$1:/i; 396 397 $block_hash->{'file_blocks'}->{$lower_drive} = 1; 398 $block_hash->{'file_blocks'}->{$upper_drive} = 1; 399 } 400 else { 401 $block_hash->{'file_blocks'}->{$filename} = 1; 402 } 403 403 } 404 404 … … 516 516 return undef; # can't recognise 517 517 } 518 518 519 519 # if we have a block_exp, then this overrides the normal 'smart' blocking 520 520 $self->store_block_files($filename_full_path, $block_hash) unless ($self->{'no_blocking'} || $self->{'block_exp'} ne ""); … … 524 524 $self->block_cover_image($filename_full_path, $block_hash); 525 525 } 526 526 527 527 return 1; 528 528 } … … 555 555 # check if the filename is already in UTF8. If it is, then we're done. 
556 556 if($filename_encoding =~ m/auto/) { 557 558 559 560 561 557 if(&unicode::check_is_utf8($filemeta)) 558 { 559 $filename_encoding = "utf8"; 560 return $filemeta; 561 } 562 562 } 563 563 … … 565 565 if ($filename_encoding eq "auto") 566 566 { 567 568 567 # try textcat 568 $filename_encoding = $self->textcat_encoding($filemeta); 569 569 570 571 570 # check the locale next 571 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 572 572 573 574 575 576 577 573 574 # now try the encoding of the document, if available 575 if ($filename_encoding eq "undefined" && defined $file_encoding) { 576 $filename_encoding = $file_encoding; 577 } 578 578 579 579 } … … 602 602 # try textcat 603 603 $filename_encoding = $self->textcat_encoding($filemeta) if $filename_encoding eq "undefined"; 604 604 605 605 # else assume filename encoding is encoding of file content, if that's available 606 606 if ($filename_encoding eq "undefined" && defined $file_encoding) { … … 608 608 } 609 609 } 610 610 611 611 elsif ($filename_encoding eq "auto-lf") 612 612 { … … 622 622 $filename_encoding = $self->locale_encoding() if $filename_encoding eq "undefined"; 623 623 } 624 624 625 625 # if still undefined, use utf8 as fallback 626 626 if ($filename_encoding eq "undefined") { … … 642 642 if ($filename_encoding !~ m/(?:ascii|utf8|unicode)/) { 643 643 $filemeta = &unicode::unicode2utf8( 644 &unicode::convert2unicode($filename_encoding, \$filemeta)645 );644 &unicode::convert2unicode($filename_encoding, \$filemeta) 645 ); 646 646 } 647 647 … … 659 659 my $outhandle = $self->{'outhandle'}; 660 660 661 662 663 661 print $outhandle "****!!!!**** BasePlugin::filename_to_utf8_metadata now deprecated\n"; 662 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(0); 663 print $outhandle "Calling method: $cfilename:$cline $cpackage->$csubr\n"; 664 664 665 665 … … 709 709 my $outhandle = $self->{'outhandle'}; 710 710 my $best_encoding = undef; 711 711 712 712 # 
get the language/encoding of the textstring using textcat 713 713 require textcat; # Only load the textcat module if it is required … … 727 727 return undef; 728 728 } 729 729 730 730 if (defined $best_encoding && $best_encoding =~ m/^iso_8859/ && &unicode::check_is_utf8($text)) { 731 731 # the text is valid utf8, so assume that's the real encoding (since textcat is based on probabilities) … … 840 840 # Start by looking for manually assigned metadata 841 841 if (defined $gs_filename_encoding) { 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 } 857 842 if (ref ($gs_filename_encoding) eq "ARRAY") { 843 my $outhandle = $self->{'outhandle'}; 844 845 $deduced_filename_encoding = $gs_filename_encoding->[0]; 846 847 my $num_vals = scalar(@$gs_filename_encoding); 848 if ($num_vals>1) { 849 print $outhandle "Warning: gs.filename_encoding multiply defined for $file\n"; 850 print $outhandle " Selecting first value: $deduced_filename_encoding\n"; 851 } 852 } 853 else { 854 $deduced_filename_encoding = $gs_filename_encoding; 855 } 856 } 857 858 858 if (!defined $deduced_filename_encoding || ($deduced_filename_encoding =~ m/^\s*$/)) { 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 859 # Look to see if plugin specifies this value 860 861 if (defined $plugin_filename_encoding) { 862 # First look to see if we're using any of the "older" (i.e. deprecated auto-... plugin options) 863 if ($plugin_filename_encoding =~ m/^auto-.*$/) { 864 my $outhandle = $self->{'outhandle'}; 865 print $outhandle "Warning: $plugin_filename_encoding is no longer supported\n"; 866 print $outhandle " default to 'auto'\n"; 867 $self->{'filename_encoding'} = $plugin_filename_encoding = "auto"; 868 } 869 870 if ($plugin_filename_encoding ne "auto") { 871 # We've been given a specific filenamne encoding 872 # => so use it! 
873 $deduced_filename_encoding = $plugin_filename_encoding; 874 } 875 } 876 876 } 877 877 … … 892 892 # See if we can determine the file system encoding through locale 893 893 $deduced_filename_encoding = $self->locale_encoding(); 894 894 895 895 # if locale shows us filesystem is utf8, check to see filename is consistent 896 896 # => if not, then we have an "alien" filename on our hands 897 897 898 898 if ($deduced_filename_encoding =~ m/^utf-?8$/i) { 899 899 if (!&unicode::check_is_utf8($file)) { … … 910 910 # } 911 911 912 913 914 915 916 917 918 919 920 921 922 912 if ($self->{'verbosity'}>3) { 913 my $outhandle = $self->{'outhandle'}; 914 915 if (defined $deduced_filename_encoding) { 916 print $outhandle " Deduced filename encoding as: $deduced_filename_encoding\n"; 917 } 918 else { 919 print $outhandle " No filename encoding deduced\n"; 920 } 921 } 922 923 923 return $deduced_filename_encoding; 924 924 } … … 998 998 # } 999 999 } 1000 1000 1001 1001 # this should be called by all plugins to set the oid of the doc obj, rather 1002 1002 # than calling doc_obj->set_OID directly … … 1017 1017 1018 1018 } 1019 1019 1020 1020 # The BasePlugin read_into_doc_obj() function. This function does all the 1021 1021 # right things to make general options work for a given plugin. 
It doesn't do anything with the file other than setting reads in … … 1045 1045 my $pp_file = &util::prettyprint_file($base_dir,$file); 1046 1046 print $outhandle "$self->{'plugin_type'} processing $pp_file\n" 1047 1047 if $self->{'verbosity'} > 1; 1048 1048 1049 1049 my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file); 1050 1050 1051 1051 # create a new document 1052 1052 my $doc_obj = new doc ($filename_full_path, "indexed_doc", $self->{'file_rename_method'}); … … 1055 1055 $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 1056 1056 $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 1057 1057 1058 1058 1059 1059 my $plugin_filename_encoding = $self->{'filename_encoding'}; … … 1123 1123 $self->associate_source_file($doc_obj, $filename); 1124 1124 } 1125 1125 1126 1126 1127 1127 } … … 1192 1192 gsprintf(STDERR, "BasePlugin::process {common.must_be_implemented}\n"); 1193 1193 1194 1194 my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1); 1195 1195 print STDERR "Calling method: $cfilename:$cline $cpackage->$csubr\n"; 1196 1196 1197 1197 die "\n"; 1198 1198 1199 1199 return undef; # never gets here … … 1215 1215 if (!open (FILE, ">:utf8", $filename)) { 1216 1216 gsprintf(STDERR, "ConvertToPlug::write_file {ConvertToPlug.could_not_open_for_writing} ($!)\n", $filename); 1217 1218 1217 die "\n"; 1218 } 1219 1219 print FILE $$textref; 1220 1220 … … 1265 1265 } 1266 1266 } 1267 1268 } 1269 1267 1268 } 1269 1270 1270 # add any extra metadata that's been passed around from one 1271 1271 # plugin to another.
Note: See TracChangeset for help on using the changeset viewer.