Changeset 22841


Ignore:
Timestamp:
2010-09-02T14:20:00+12:00 (14 years ago)
Author:
davidb
Message:

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes); otherwise binary bytes will be incorrectly re-encoded as UTF-8 (which is not what you want, as they are already in UTF-8 form). In the case of this plugin, both text and metadata (read from doc.xml using XML::Parser) are in binary byte format, and so need to be decoded before being passed to $doc_obj->add_utf8_...

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm

    r17745 r22841  
     1
    12###########################################################################
    23#
     
    3132package GreenstoneXMLPlugin;
    3233
     34use Encode;
     35
    3336use ReadXMLFile;
    3437
     
    3942    @GreenstoneXMLPlugin::ISA = ('ReadXMLFile');
    4043}
     44
     45
    4146
    4247
     
    143148    }
    144149    elsif ($element eq "Metadata") {
    145     $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $self->{'metadata_name'},$self->{'metadata_value'});
     150    # text read in by XML::Parser is in Perl's binary byte value
     151    # form ... need to explicitly make it UTF-8
     152
     153    my $metadata_name = decode("utf-8",$self->{'metadata_name'});
     154    my $metadata_value = decode("utf-8",$self->{'metadata_value'});
     155
     156    $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
     157                          $metadata_name,$metadata_value);
     158
    146159        # Ensure this value is added to the allvalues database in gseditor.
    147160        # Note that the database constraints prevent multiple occurances of the
     
    182195    }
    183196    elsif ($element eq "Content" && $self->{'content'} ne "") {
    184     $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $self->{'content'});
     197
     198    # text read in by XML::Parser is in Perl's binary byte value
     199    # form ... need to explicitly make it UTF-8
     200    my $content = decode("utf-8",$self->{'content'});
     201
     202    $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content);
    185203    $self->{'content'} = "";
    186204    }
Note: See TracChangeset for help on using the changeset viewer.