Changeset 11389


Ignore:
Timestamp:
2006-03-17T13:42:39+13:00 (18 years ago)
Author:
jrm21
Message:

Try to get the encoding from a '<meta http-equiv' tag if the file is HTML.

Make sure we add the filename/Source metadata as UTF-8, so that we
won't create invalid XML files if the filename is in some other encoding.

If the file contents aren't in utf8, assume the filename is in the same
encoding as the contents.

Print out the encoding we used to read this file if verbosity >= 3.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r11368 r11389  
    4545use multiread;
    4646use encodings;
     47use unicode;
    4748use cnseg;
    4849use acronym;
     
    808809    # Do encoding stuff
    809810    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
     811    if ($self->{'verbosity'} > 2) {
     812    print $outhandle "BasPlug: reading $file as ($encoding,$language)\n";
     813    }
    810814
    811815    # create a new document
     
    819823    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    820824    # how do we know what encoding the filename is in?
    821     $doc_obj->add_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
     825    # assume it is in the same encoding as its contents
     826    if ($encoding !~ /(?:ascii|utf8|unicode)/) {
     827    $filemeta = unicode::unicode2utf8(
     828        unicode::convert2unicode($encoding, \$filemeta)
     829    );
     830    }
     831    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Source", &ghtml::dmsafe($filemeta));
    822832    if ($self->{'cover_image'}) {
    823833    $self->associate_cover_image($doc_obj, $filename);
     
    10281038        my $outhandle = $self->{'outhandle'};
    10291039        gsprintf($outhandle, "$plugin_name: {BasPlug.wrong_encoding}\n", $filename, $encoding, $extracted_encoding);
    1030             # print $outhandle "$plugin_name: WARNING: $filename was read using $encoding encoding but ";
    1031             # print $outhandle "appears to be encoded as $extracted_encoding.\n";
    10321040        }
    10331041    } else {
     
    10471055    my $outhandle = $self->{'outhandle'};
    10481056    my $unicode_format = "";
     1057    my $best_language = "";
     1058    my $best_encoding = "";
     1059   
    10491060    # read in file
    1050     open (FILE, $filename) || (gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename) && die "\n"); # die "BasPlug::get_language_encoding could not open $filename for reading ($!)\n";
     1061    if (!open (FILE, $filename)) {
     1062    gsprintf(STDERR, "BasPlug::get_language_encoding {BasPlug.could_not_open_for_reading} ($!)\n", $filename);
     1063    # this is a pretty bad error, but try to continue anyway
     1064    return ($self->{'default_language'}, $self->{'input_encoding'});
     1065    }
    10511066    undef $/;
    10521067    my $text = <FILE>;
     
    10691084    }
    10701085   
    1071     # VB scripting generated Word to HTML file
    1072     if ($text =~ /charset=(windows.*)[\"]/ig){
    1073     my $vbhtml_encoding = $1;
    1074     $vbhtml_encoding =~ s/-+/_/g;
    1075     $self->{'input_encoding'} = $vbhtml_encoding;
    1076     }
    1077    
    1078     # remove <title>stuff</title> -- as titles tend often to be in English
    1079     # for foreign language documents
    1080     $text =~ s/<title>(.|\n)*?<\/title>//i;
    1081 
    1082     # remove all HTML tags
     1086
     1087    # handle html files specially
    10831088    # XXX this doesn't match plugins derived from HTMLPlug (except ConvertTo)
    10841089    if (ref($self) eq 'HTMLPlug' ||
    10851090    (exists $self->{'converted_to'} && $self->{'converted_to'} eq 'HTML')){
     1091
     1092    # remove <title>stuff</title> -- as titles tend often to be in English
     1093    # for foreign language documents
     1094    $text =~ s!<title>.*?</title>!!si;
     1095
     1096    # see if this html file specifies its encoding
     1097    if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
     1098        $best_encoding = $1;
     1099    } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {
     1100        $best_encoding = $1;
     1101    }
     1102    if ($best_encoding) { # we extracted an encoding
     1103        $best_encoding =~ s/-+/_/g;
     1104        $best_encoding = lc($best_encoding); # lowercase
     1105        if ($best_encoding eq "utf_8") { $best_encoding = "utf8" }
     1106        $self->{'input_encoding'} = $best_encoding;
     1107    }
     1108       
     1109    # remove all HTML tags
    10861110    $text =~ s/<[^>]*>//sg;
    10871111    }
     
    10931117    # first one in the list - otherwise use the defaults
    10941118    if (scalar @$results > 3) {
    1095     my $best_encoding="";
    10961119    if ($unicode_format) { # in case the first had a BOM
    10971120        $best_encoding=$unicode_format;
     
    11221145             $filename, $self->{'default_language'});
    11231146        }       
    1124         return ($self->{'default_language'}, $self->{'input_encoding'});
     1147        $best_language = $self->{'default_language'};
     1148        $best_encoding = $self->{'input_encoding'};
    11251149
    11261150    } else {
     
    11301154             $filename, $self->{'default_language'});
    11311155        }
    1132         return ($self->{'default_language'}, $best_encoding);
    1133     }
    1134     }
    1135 
    1136     # format language/encoding
    1137     my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
    1138     if (!defined $language) {
    1139     if ($self->{'verbosity'}>2) {
    1140         gsprintf($outhandle,
    1141              "BasPlug: {BasPlug.could_not_extract_language}\n",
    1142              $filename, $self->{'default_language'});
    1143     }
    1144     $language = $self->{'default_language'};
    1145     }
    1146     if (!defined $encoding) {
    1147     if ($self->{'verbosity'}>2) {
    1148         gsprintf($outhandle,
    1149              "BasPlug: {BasPlug.could_not_extract_encoding}\n",
    1150              $filename, $self->{'default_encoding'});
    1151     }
    1152     $encoding = $self->{'default_encoding'};
    1153     }
    1154 
     1156        $best_language = $self->{'default_language'};
     1157    }
     1158    } else { # <= 3 suggestions
     1159    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
     1160    if (!defined $language) {
     1161        if ($self->{'verbosity'}>2) {
     1162        gsprintf($outhandle,
     1163            "BasPlug: {BasPlug.could_not_extract_language}\n",
     1164            $filename, $self->{'default_language'});
     1165        }
     1166        $language = $self->{'default_language'};
     1167    }
     1168    if (!defined $encoding) {
     1169        if ($self->{'verbosity'}>2) {
     1170        gsprintf($outhandle,
     1171            "BasPlug: {BasPlug.could_not_extract_encoding}\n",
     1172            $filename, $self->{'default_encoding'});
     1173        }
     1174        $encoding = $self->{'default_encoding'};
     1175    }
     1176    $best_language = $language;
     1177    if (! $best_encoding ) { # may already be set... eg from html meta tag
     1178        $best_encoding = $encoding;
     1179    }
     1180    }
     1181
     1182    my $text_copy = $text;
     1183    if ($best_encoding =~ /^iso_8859/ && unicode::ensure_utf8(\$text_copy)==0) {
     1184    # the text is valid utf8, so assume that's the real encoding
     1185    # (since textcat is based on probabilities)
     1186    $best_encoding = 'utf8';
     1187    }
    11551188
    11561189    # check for equivalents where textcat doesn't have some encodings...
    11571190    # eg MS versions of standard encodings
    1158     if ($encoding =~ /^iso_8859_(\d+)/) {
     1191    if ($best_encoding =~ /^iso_8859_(\d+)/) {
    11591192    my $iso = $1; # which variant of the iso standard?
    11601193    # iso-8859 sets don't use chars 0x80-0x9f, windows codepages do
    11611194    if ($text =~ /[\x80-\x9f]/) {
    11621195        # Western Europe
    1163         if ($iso == 1 or $iso == 15) { $encoding = 'windows_1252' }
    1164         elsif ($iso == 2) { $encoding = 'windows_1250' } # Central Europe
    1165         elsif ($iso == 5) { $encoding = 'windows_1251' } # Cyrillic
    1166         elsif ($iso == 6) { $encoding = 'windows_1256' } # Arabic
    1167         elsif ($iso == 7) { $encoding = 'windows_1253' } # Greek
    1168         elsif ($iso == 8) { $encoding = 'windows_1255' } # Hebrew
    1169         elsif ($iso == 9) { $encoding = 'windows_1254' } # Turkish
    1170     }
    1171     }
    1172 
    1173     if ($encoding !~ /^(ascii|utf8|unicode)$/ &&
    1174     !defined $encodings::encodings->{$encoding}) {
     1196        if ($iso == 1 or $iso == 15) { $best_encoding = 'windows_1252' }
     1197        elsif ($iso == 2) {$best_encoding = 'windows_1250'} # Central Europe
     1198        elsif ($iso == 5) {$best_encoding = 'windows_1251'} # Cyrillic
     1199        elsif ($iso == 6) {$best_encoding = 'windows_1256'} # Arabic
     1200        elsif ($iso == 7) {$best_encoding = 'windows_1253'} # Greek
     1201        elsif ($iso == 8) {$best_encoding = 'windows_1255'} # Hebrew
     1202        elsif ($iso == 9) {$best_encoding = 'windows_1254'} # Turkish
     1203    }
     1204    }
     1205
     1206    if ($best_encoding !~ /^(ascii|utf8|unicode)$/ &&
     1207    !defined $encodings::encodings->{$best_encoding}) {
    11751208    if ($self->{'verbosity'}) {
    11761209        gsprintf($outhandle, "BasPlug: {BasPlug.unsupported_encoding}\n",
    1177              $filename, $encoding, $self->{'default_encoding'});
    1178     }
    1179     $encoding = $self->{'default_encoding'};
    1180     }
    1181 
    1182     return ($language, $encoding);
     1210             $filename, $best_encoding, $self->{'default_encoding'});
     1211    }
     1212    $best_encoding = $self->{'default_encoding'};
     1213    }
     1214
     1215    return ($best_language, $best_encoding);
    11831216}
    11841217
Note: See TracChangeset for help on using the changeset viewer.