Changeset 22840

Show
Ignore:
Timestamp:
02.09.2010 14:18:47 (9 years ago)
Author:
davidb
Message:

More explicit use of utf8 for input and output file handling. This relies on strings in Perl being Unicode-aware (and not merely binary bytes); otherwise the binary bytes will be incorrectly re-encoded as UTF-8 (which is not what you want, as they are already in UTF-8 form). In the case of this plugin, text that comes in from doctxt.xml (read with XML::XPath) is Unicode-aware (and can be added to $doc_obj directly with add_utf8_...). The metadata (read from docmets.xml using XML::Parser) is in binary byte format, and so needs to be decoded before being passed to $doc_obj->add_utf8_...

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/GreenstoneMETSPlugin.pm

    r21803 r22840  
    3232package GreenstoneMETSPlugin; 
    3333 
     34use Encode; 
    3435use ghtml; 
    3536 
     
    4546} 
    4647 
    47 my $arguments = [ 
    48          ]; 
     48my $arguments = [ { 'name' => "process_exp", 
     49            'desc' => "{BasePlugin.process_exp}", 
     50            'type' => "regexp", 
     51            'reqd' => "no", 
     52            'deft' => &get_default_process_exp() 
     53            } 
     54          ]; 
     55 
    4956my $options = { 'name'     => "GreenstoneMETSPlugin", 
    5057        'desc'     => "{GreenstoneMETSPlugin.desc}", 
    5158        'abstract' => "no", 
    52         'inherits' => "yes" }; 
     59        'inherits' => "yes", 
     60        'args'     => $arguments }; 
     61  
    5362 
    5463 
     
    100109    $filename =~ s/docmets.xml$/doctxt.xml/; 
    101110    
    102     if (!open (FILEIN,"<$filename")){ 
     111    if (!open (FILEIN, "<:utf8", $filename)) { 
    103112    print STDERR "Warning: unable to open the $filename\n"; 
    104113    $self->{'xmltxt'} = ""; 
     
    111120        } 
    112121    } 
     122 
    113123        my $xml_parser = XML::XPath->new (xml=> $xml_text); 
    114124    #my $xml_tree = $xml_parser->parse ($xml_text); 
     
    222232     
    223233    foreach my $md_pair (@$md_list){ 
    224         my $metadata_name = $md_pair->{'metadata_name'}; 
    225         my $metadata_value = $md_pair->{'metadata_value'}; 
    226         $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $metadata_name, $metadata_value); 
     234        # text read in by XML::Parser is in Perl's binary byte value 
     235        # form ... need to explicitly make it UTF-8 
     236 
     237        my $metadata_name = decode("utf8",$md_pair->{'metadata_name'}); 
     238        my $metadata_value = decode("utf8",$md_pair->{'metadata_value'}); 
     239 
     240        $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},  
     241                          $metadata_name, $metadata_value); 
    227242    } 
    228243     
     
    231246     
    232247    foreach my $section_content (@$content_list){ 
     248        # Don't need to decode $content as this has been readin in 
     249        # through XPath which (unlike XML::Parser) correctly sets 
     250        # the string to be UTF8 rather than a 'binary' string of bytes 
    233251        my $content = $section_content->{'section_content'}; 
     252 
    234253        $self->{'doc_obj'}->add_utf8_text($self->{'section'},$content); 
    235254    }