Changeset 22844


Ignore:
Timestamp:
2010-09-02T14:32:53+12:00 (14 years ago)
Author:
davidb
Message:

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes); otherwise binary bytes will be incorrectly re-encoded as UTF-8 (which is not what you want, as they are already in UTF-8 form). Decoding the text read in at this point as UTF-8 means that (subsequent) regular expression processing of the data can make use of Unicode-aware operations, such as what Unicode classes as punctuation.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ReadTextFile.pm

    r18320 r22844  
    3030no strict 'refs'; # allow filehandles to be variables and viceversa
    3131
     32use Encode;
    3233
    3334use multiread;
     
    220221    $reader->read_file ($textref);
    221222    }
     223
     224    # At this point $$textref is a binary byte string
     225    # => turn it into a Unicode aware string, so full
     226    # Unicode aware pattern matching can be used.
     227    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
     228    #
     229
     230    $$textref = decode("utf8",$$textref);
     231
     232    close FILE;
     233}
     234
     235
     236# Not currently used
     237sub read_file_usingPerlsEncodeModule {
     238    my $self = shift (@_);
     239    my ($filename, $encoding, $language, $textref) = @_;
     240
     241    if (!-r $filename)
     242    {
     243        my $outhandle = $self->{'outhandle'};
     244        gsprintf($outhandle, "{ReadTextFile.read_denied}\n", $filename) if $self->{'verbosity'};
     245        # print $outhandle "Read permission denied for $filename\n" if $self->{'verbosity'};
     246        return;
     247    }
     248    $$textref = "";
     249    if (!open (FILE, $filename)) {
     250        gsprintf(STDERR, "ReadTextFile::read_file {ReadTextFile.could_not_open_f
     251or_reading} ($!)\n", $filename);
     252        die "\n";
     253    }
     254
     255    my $store_slash = $/;
     256    undef $/;
     257    my $text = <FILE>;
     258    $/ = $store_slash;
     259
     260    $$textref = decode($encoding,$text);
     261
    222262    close FILE;
    223263}
Note: See TracChangeset for help on using the changeset viewer.