Changeset 16724


Ignore:
Timestamp:
2008-08-12T14:54:31+12:00 (13 years ago)
Author:
ak19
Message:
  1. Dr Bainbridge added some language-encoding related methods that work with multiread.pm. 2. When checking for an encoding specified in the head tag (meta http-equiv), the code now checks whether this meta tag is the first item nested inside a comment, in which case it is ignored. Open question: what about enclosing comments in which the meta http-equiv tag is not the first element? 3. textcat now calls classify_cached_filecontents, which stores the file encoding of the file.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r16699 r16724  
    139139    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});   
    140140    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
     141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path));
    142142    $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding);
    143143
     
    225225
    226226
     227sub read_file_no_decoding {
     228    my $self = shift (@_);
     229    my ($filename, $textref) = @_;
     230
     231    if (!-r $filename)
     232    {
     233    my $outhandle = $self->{'outhandle'};
     234    gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
     235    # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
     236    return;
     237    }
     238    $$textref = "";
     239    if (!open (FILE, $filename)) {
     240    gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename);
     241    die "\n";
     242    }
     243     
     244    my $reader = new multiread();
     245    $reader->set_handle ('ReadTextFile::FILE');
     246    $reader->read_file_no_decoding ($textref);
     247   
     248    $self->{'reader'} = $reader;
     249
     250    close FILE;
     251}
     252
     253
     254sub decode_text {
     255    my $self = shift (@_);
     256    my ($raw_text, $encoding, $language, $textref) = @_;
     257
     258    my $reader = $self->{'reader'};
     259    if (!defined $reader) {
     260    gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n");
     261    }
     262    else {
     263    $reader->set_encoding($encoding);
     264    $reader->decode_text($raw_text,$textref);
     265    }
     266}
     267
     268
    227269sub textcat_get_language_encoding {
    228270    my $self = shift (@_);
    229271    my ($filename) = @_;
    230272
    231    
    232273    my ($language, $encoding, $extracted_encoding);
    233274    if ($self->{'input_encoding'} eq "auto") {
     
    235276        ($language, $encoding) = $self->get_language_encoding ($filename);
    236277    } elsif ($self->{'extract_language'}) {
    237         # use textcat to get language metadata
     278    # use textcat to get language metadata
    238279        ($language, $extracted_encoding) = $self->get_language_encoding ($filename);
    239280        $encoding = $self->{'input_encoding'};
     
    241282    # to english in iso-8859-1 (except for some punctuation). We don't have
    242283    # a language model for en_utf8, so textcat always says iso-8859-1!
    243         if ($extracted_encoding ne $encoding && $language ne "en"
    244         && $self->{'verbosity'}) {
     284        if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) {
    245285        my $plugin_name = ref ($self);
    246286        my $outhandle = $self->{'outhandle'};
     
    249289    } else {
    250290        $language = $self->{'default_language'};
    251         $encoding = $self->{'input_encoding'};
    252     }
     291        $encoding = $self->{'input_encoding'};
     292    }
     293   
     294#    print STDERR "**** language encoding of contents of file $filename:\n\t****$language $encoding\n";
    253295
    254296    return ($language, $encoding);
     
    305347    if ($text =~ /^<\?xml.*encoding="(.+?)"/) {
    306348        $best_encoding = $1;
    307     } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#"
     349    }
     350    # check the meta http-equiv charset tag unless it is commented out
     351    elsif (($text !~ /<!--[^<>]?<meta http-equiv/i) && ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)\"/i)) {           
    308352        $best_encoding = $1;
     353#       print STDERR "**** meta tag found, encoding is: $best_encoding\n";
    309354    }
    310355    if ($best_encoding) { # we extracted an encoding
     
    323368    # get the language/encoding
    324369    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'}));
    325     my $results = $self->{'textcat'}->classify(\$text);
     370#    my $results = $self->{'textcat'}->classify(\$text);
     371    my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename);
    326372
    327373    # if textcat returns 3 or less possibilities we'll use the
Note: See TracChangeset for help on using the changeset viewer.