Changeset 16724

Show
Ignore:
Timestamp:
12.08.2008 14:54:31 (11 years ago)
Author:
ak19
Message:

1. Dr Bainbridge added some language-encoding related methods that work with multiread.pm. 2. When checking for an encoding specified in the head tag (meta http-equiv), it now checks whether this meta tag is the first item nested in a comment, in which case it is ignored. What about enclosing comments in which the meta http-equiv tag is not the first element? 3. textcat calls classify_cached_filecontents, which stores the file encoding of the file.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/plugins/ReadTextFile.pm

    r16699 r16724  
    139139    $doc_obj->set_OIDtype ($processor->{'OIDtype'}, $processor->{'OIDmetadata'});     
    140140    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}"); 
    141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 
     141    $doc_obj->add_utf8_metadata($top_section, "FileSize", (-s $filename_full_path)); 
    142142    $self->set_Source_metadata($doc_obj, $filename_no_path, $encoding); 
    143143 
     
    225225 
    226226 
     227sub read_file_no_decoding { 
     228    my $self = shift (@_); 
     229    my ($filename, $textref) = @_; 
     230 
     231    if (!-r $filename) 
     232    { 
     233    my $outhandle = $self->{'outhandle'}; 
     234    gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'}; 
     235    # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; 
     236    return; 
     237    } 
     238    $$textref = ""; 
     239    if (!open (FILE, $filename)) { 
     240    gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_for_reading} ($!)\n", $filename); 
     241    die "\n"; 
     242    } 
     243      
     244    my $reader = new multiread(); 
     245    $reader->set_handle ('ReadTextFile::FILE'); 
     246    $reader->read_file_no_decoding ($textref); 
     247     
     248    $self->{'reader'} = $reader; 
     249 
     250    close FILE; 
     251} 
     252 
     253 
     254sub decode_text { 
     255    my $self = shift (@_); 
     256    my ($raw_text, $encoding, $language, $textref) = @_; 
     257 
     258    my $reader = $self->{'reader'}; 
     259    if (!defined $reader) { 
     260    gsprintf(STDERR, "ReadTextFile::decode_text needs to call ReadTextFile::read_file_no_decoding first\n"); 
     261    } 
     262    else { 
     263    $reader->set_encoding($encoding); 
     264    $reader->decode_text($raw_text,$textref); 
     265    } 
     266} 
     267 
     268 
    227269sub textcat_get_language_encoding { 
    228270    my $self = shift (@_); 
    229271    my ($filename) = @_; 
    230272 
    231      
    232273    my ($language, $encoding, $extracted_encoding); 
    233274    if ($self->{'input_encoding'} eq "auto") { 
     
    235276        ($language, $encoding) = $self->get_language_encoding ($filename); 
    236277    } elsif ($self->{'extract_language'}) { 
    237         # use textcat to get language metadata 
     278    # use textcat to get language metadata 
    238279        ($language, $extracted_encoding) = $self->get_language_encoding ($filename); 
    239280        $encoding = $self->{'input_encoding'}; 
     
    241282    # to english in iso-8859-1 (except for some punctuation). We don't have 
    242283    # a language model for en_utf8, so textcat always says iso-8859-1! 
    243         if ($extracted_encoding ne $encoding && $language ne "en" 
    244         && $self->{'verbosity'}) { 
     284        if ($extracted_encoding ne $encoding && $language ne "en" && $self->{'verbosity'}) { 
    245285        my $plugin_name = ref ($self); 
    246286        my $outhandle = $self->{'outhandle'}; 
     
    249289    } else { 
    250290        $language = $self->{'default_language'}; 
    251         $encoding = $self->{'input_encoding'}; 
    252     } 
     291        $encoding = $self->{'input_encoding'};  
     292    } 
     293     
     294#    print STDERR "**** language encoding of contents of file $filename:\n\t****$language $encoding\n"; 
    253295 
    254296    return ($language, $encoding); 
     
    305347    if ($text =~ /^<\?xml.*encoding="(.+?)"/) { 
    306348        $best_encoding = $1; 
    307     } elsif ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)"/i) {#" 
     349    } 
     350    # check the meta http-equiv charset tag unless it is commented out 
     351    elsif (($text !~ /<!--[^<>]?<meta http-equiv/i) && ($text =~ /<meta http-equiv.*content-type.*charset=(.+?)\"/i)) {             
    308352        $best_encoding = $1; 
     353#       print STDERR "**** meta tag found, encoding is: $best_encoding\n"; 
    309354    } 
    310355    if ($best_encoding) { # we extracted an encoding 
     
    323368    # get the language/encoding 
    324369    $self->{'textcat'} = new textcat() if (!defined($self->{'textcat'})); 
    325     my $results = $self->{'textcat'}->classify(\$text); 
     370#    my $results = $self->{'textcat'}->classify(\$text); 
     371    my $results = $self->{'textcat'}->classify_cached_filecontents(\$text, $filename); 
    326372 
    327373    # if textcat returns 3 or less possibilities we'll use the