Changeset 11389
- Timestamp: 2006-03-17T13:42:39+13:00 (18 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r11368 r11389 45 45 use multiread; 46 46 use encodings; 47 use unicode; 47 48 use cnseg; 48 49 use acronym; … … 808 809 # Do encoding stuff 809 810 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename); 811 if ($self->{'verbosity'} > 2) { 812 print $outhandle "BasPlug: reading $file as ($encoding,$language)\n"; 813 } 810 814 811 815 # create a new document … … 819 823 my ($filemeta) = $file =~ /([^\\\/]+)$/; 820 824 # how do we know what encoding the filename is in? 821 $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 825 # assume it is in the same encoding as its contents 826 if ($encoding !~ /(?:ascii|utf8|unicode)/) { 827 $filemeta = unicode::unicode2utf8( 828 unicode::convert2unicode($encoding, \$filemeta) 829 ); 830 } 831 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta)); 822 832 if ($self->{'cover_image'}) { 823 833 $self->associate_cover_image($doc_obj, $filename); … … 1028 1038 my $outhandle = $self->{'outhandle'}; 1029 1039 gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding); 1030 # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";1031 # print $outhandle "appears to be encoded as $extracted_encoding.\n";1032 1040 } 1033 1041 } else { … … 1047 1055 my $outhandle = $self->{'outhandle'}; 1048 1056 my $unicode_format = ""; 1057 my $best_language = ""; 1058 my $best_encoding = ""; 1059 1049 1060 # read in file 1050 open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n"; 1061 if (!open (FILE, $filename)) { 1062 gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename); 1063 # this is a pretty bad error, but try to continue anyway 1064 return 
($self->{'default_language'}, $self->{'input_encoding'}); 1065 } 1051 1066 undef $/; 1052 1067 my $text = <FILE>; … … 1069 1084 } 1070 1085 1071 # VB scripting generated Word to HTML file 1072 if ($text =~ /charset=(windows.*)[\"]/ig){ 1073 my $vbhtml_encoding = $1; 1074 $vbhtml_encoding =~ s/-+/_/g; 1075 $self->{'input_encoding'} = $vbhtml_encoding; 1076 } 1077 1078 # remove <title>stuff</title> -- as titles tend often to be in English 1079 # for foreign language documents 1080 $text =~ s/<title>(.|\n)*?<\/title>//i; 1081 1082 # remove all HTML tags 1086 1087 # handle html files specially 1083 1088 # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo) 1084 1089 if (ref($self) eq 'HTMLPlug' || 1085 1090 (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){ 1091 1092 # remove <title>stuff</title> -- as titles tend often to be in English 1093 # for foreign language documents 1094 $text =~ s!<title>.*?</title>!!si; 1095 1096 # see if this html file specifies its encoding 1097 if ($text =~ /^<\?xml.*encoding="(.+?)"/) { 1098 $best_encoding = $1; 1099 } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) { 1100 $best_encoding = $1; 1101 } 1102 if ($best_encoding) { # we extracted an encoding 1103 $best_encoding =~ s/-+/_/g; 1104 $best_encoding = lc($best_encoding); # lowercase 1105 if ($best_encoding eq "utf_8") { $best_encoding = "utf8" } 1106 $self->{'input_encoding'} = $best_encoding; 1107 } 1108 1109 # remove all HTML tags 1086 1110 $text =~ s/<[^>]*>//sg; 1087 1111 } … … 1093 1117 # first one in the list - otherwise use the defaults 1094 1118 if (scalar @$results > 3) { 1095 my $best_encoding="";1096 1119 if ($unicode_format) { # in case the first had a BOM 1097 1120 $best_encoding=$unicode_format; … … 1122 1145 $filename, $self->{'default_language'}); 1123 1146 } 1124 return ($self->{'default_language'}, $self->{'input_encoding'}); 1147 $best_language = $self->{'default_language'}; 1148 $best_encoding = 
$self->{'input_encoding'}; 1125 1149 1126 1150 } else { … … 1130 1154 $filename, $self->{'default_language'}); 1131 1155 } 1132 return ($self->{'default_language'}, $best_encoding); 1133 } 1134 } 1135 1136 # format language/encoding 1137 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 1138 if (!defined $language) { 1139 if ($self->{'verbosity'}>2) { 1140 gsprintf($outhandle, 1141 "BasPlug: {BasPlug.could_not_extract_language}\n", 1142 $filename, $self->{'default_language'}); 1143 } 1144 $language = $self->{'default_language'}; 1145 } 1146 if (!defined $encoding) { 1147 if ($self->{'verbosity'}>2) { 1148 gsprintf($outhandle, 1149 "BasPlug: {BasPlug.could_not_extract_encoding}\n", 1150 $filename, $self->{'default_encoding'}); 1151 } 1152 $encoding = $self->{'default_encoding'}; 1153 } 1154 1156 $best_language = $self->{'default_language'}; 1157 } 1158 } else { # <= 3 suggestions 1159 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 1160 if (!defined $language) { 1161 if ($self->{'verbosity'}>2) { 1162 gsprintf($outhandle, 1163 "BasPlug: {BasPlug.could_not_extract_language}\n", 1164 $filename, $self->{'default_language'}); 1165 } 1166 $language = $self->{'default_language'}; 1167 } 1168 if (!defined $encoding) { 1169 if ($self->{'verbosity'}>2) { 1170 gsprintf($outhandle, 1171 "BasPlug: {BasPlug.could_not_extract_encoding}\n", 1172 $filename, $self->{'default_encoding'}); 1173 } 1174 $encoding = $self->{'default_encoding'}; 1175 } 1176 $best_language = $language; 1177 if (! $best_encoding ) { # may already be set... 
eg from html meta tag 1178 $best_encoding = $encoding; 1179 } 1180 } 1181 1182 my $text_copy = $text; 1183 if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) { 1184 # the text is valid utf8, so assume that's the real encoding 1185 # (since textcat is based on probabilities) 1186 $best_encoding = 'utf8'; 1187 } 1155 1188 1156 1189 # check for equivalents where textcat doesn't have some encodings... 1157 1190 # eg MS versions of standard encodings 1158 if ($ encoding =~ /^iso_8859_(\d+)/) {1191 if ($best_encoding =~ /^iso_8859_(\d+)/) { 1159 1192 my $iso = $1; # which variant of the iso standard? 1160 1193 # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do 1161 1194 if ($text =~ /[\x80-\x9f]/) { 1162 1195 # Western Europe 1163 if ($iso == 1 or $iso == 15) { $ encoding = 'windows_1252' }1164 elsif ($iso == 2) { $encoding = 'windows_1250'} # Central Europe1165 elsif ($iso == 5) { $encoding = 'windows_1251'} # Cyrillic1166 elsif ($iso == 6) { $encoding = 'windows_1256'} # Arabic1167 elsif ($iso == 7) { $encoding = 'windows_1253'} # Greek1168 elsif ($iso == 8) { $encoding = 'windows_1255'} # Hebrew1169 elsif ($iso == 9) { $encoding = 'windows_1254'} # Turkish1170 } 1171 } 1172 1173 if ($ encoding !~ /^(ascii|utf8|unicode)$/ &&1174 !defined $encodings::encodings->{$ encoding}) {1196 if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' } 1197 elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe 1198 elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic 1199 elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic 1200 elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek 1201 elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew 1202 elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish 1203 } 1204 } 1205 1206 if ($best_encoding !~ /^(ascii|utf8|unicode)$/ && 1207 !defined $encodings::encodings->{$best_encoding}) { 1175 1208 if ($self->{'verbosity'}) { 1176 1209 
gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n", 1177 $filename, $ encoding, $self->{'default_encoding'});1178 } 1179 $ encoding = $self->{'default_encoding'};1180 } 1181 1182 return ($ language, $encoding);1210 $filename, $best_encoding, $self->{'default_encoding'}); 1211 } 1212 $best_encoding = $self->{'default_encoding'}; 1213 } 1214 1215 return ($best_language, $best_encoding); 1183 1216 } 1184 1217
Note: See TracChangeset for help on using the changeset viewer.