Changeset 22844

Show
Ignore:
Timestamp:
02.09.2010 14:32:53 (9 years ago)
Author:
davidb
Message:

More explicit use of UTF-8 for input and output file handling. This relies on strings in Perl being Unicode-aware (and not merely binary bytes); otherwise the binary bytes will be incorrectly re-encoded as UTF-8 (which is not what you want, as they are already in UTF-8 form). Decoding the text read in at this point from UTF-8 into a Unicode-aware Perl string means that (subsequent) regular expression processing of the data can make use of Unicode-aware operations, such as matching what Unicode classes as punctuation.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r18320 r22844  
    3030no strict 'refs'; # allow filehandles to be variables and viceversa 
    3131 
     32use Encode; 
    3233 
    3334use multiread; 
     
    220221    $reader->read_file ($textref); 
    221222    } 
     223 
     224    # At this point $$textref is a binary byte string 
     225    # => turn it into a Unicode aware string, so full 
     226    # Unicode aware pattern matching can be used. 
     227    # For instance: 's/\x{0101}//g' or '[[:upper:]]' 
     228    #  
     229 
     230    $$textref = decode("utf8",$$textref); 
     231 
     232    close FILE; 
     233} 
     234 
     235 
     236# Not currently used 
     237sub read_file_usingPerlsEncodeModule { 
     238    my $self = shift (@_); 
     239    my ($filename, $encoding, $language, $textref) = @_; 
     240 
     241    if (!-r $filename) 
     242    { 
     243        my $outhandle = $self->{'outhandle'}; 
     244        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'}; 
     245        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'}; 
     246        return; 
     247    } 
     248    $$textref = ""; 
     249    if (!open (FILE, $filename)) { 
     250        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f 
     251or_reading} ($!)\n", $filename); 
     252        die "\n"; 
     253    } 
     254 
     255    my $store_slash = $/; 
     256    undef $/; 
     257    my $text = <FILE>; 
     258    $/ = $store_slash; 
     259 
     260    $$textref = decode($encoding,$text); 
     261 
    222262    close FILE; 
    223263}