Changeset 22841

Show
Ignore:
Timestamp:
02.09.2010 14:20:00 (9 years ago)
Author:
davidb
Message:

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode-aware (and not merely binary bytes); otherwise the binary bytes will be incorrectly re-encoded as UTF-8 (which is not what you want, as they are already in UTF-8 form). In the case of this plugin, both text and metadata (read from doc.xml using XML::Parser) are in binary byte format, and so need to be decoded before being added via the $doc_obj->add_utf8_... methods (add_utf8_metadata, add_utf8_text).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/GreenstoneXMLPlugin.pm

    r17745 r22841  
     1 
    12########################################################################### 
    23# 
     
    3132package GreenstoneXMLPlugin; 
    3233 
     34use Encode; 
     35 
    3336use ReadXMLFile; 
    3437 
     
    3942    @GreenstoneXMLPlugin::ISA = ('ReadXMLFile'); 
    4043} 
     44 
     45 
    4146 
    4247 
     
    143148    } 
    144149    elsif ($element eq "Metadata") { 
    145     $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $self->{'metadata_name'},$self->{'metadata_value'}); 
     150    # text read in by XML::Parser is in Perl's binary byte value 
     151    # form ... need to explicitly make it UTF-8 
     152 
     153    my $metadata_name = decode("utf-8",$self->{'metadata_name'}); 
     154    my $metadata_value = decode("utf-8",$self->{'metadata_value'}); 
     155 
     156    $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},  
     157                          $metadata_name,$metadata_value); 
     158 
    146159        # Ensure this value is added to the allvalues database in gseditor. 
    147160        # Note that the database constraints prevent multiple occurrences of the 
     
    182195    } 
    183196    elsif ($element eq "Content" && $self->{'content'} ne "") { 
    184     $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $self->{'content'}); 
     197 
     198    # text read in by XML::Parser is in Perl's binary byte value 
     199    # form ... need to explicitly make it UTF-8 
     200    my $content = decode("utf-8",$self->{'content'}); 
     201 
     202    $self->{'doc_obj'}->add_utf8_text($self->{'section'}, $content); 
    185203    $self->{'content'} = ""; 
    186204    }