Changeset 22840


Ignore:
Timestamp:
2010-09-02T14:18:47+12:00 (14 years ago)
Author:
davidb
Message:

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes); otherwise binary bytes will be incorrectly re-encoded as UTF-8 (which is not what you want, as they are already in UTF-8 form). In the case of this plugin, text that comes in from doctxt.xml (read with XPath) is Unicode aware (and can be added to $doc_obj directly with add_utf8_...). In the case of the metadata (read from docmets.xml using XML::Parser), it is in binary byte format, and so needs to be decoded before being added to $doc_obj via add_utf8_...

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/GreenstoneMETSPlugin.pm

    r21803 r22840  
    3232package GreenstoneMETSPlugin;
    3333
     34use Encode;
    3435use ghtml;
    3536
     
    4546}
    4647
    47 my $arguments = [
    48          ];
     48my $arguments = [ { 'name' => "process_exp",
     49            'desc' => "{BasePlugin.process_exp}",
     50            'type' => "regexp",
     51            'reqd' => "no",
     52            'deft' => &get_default_process_exp()
     53            }
     54          ];
     55
    4956my $options = { 'name'     => "GreenstoneMETSPlugin",
    5057        'desc'     => "{GreenstoneMETSPlugin.desc}",
    5158        'abstract' => "no",
    52         'inherits' => "yes" };
     59        'inherits' => "yes",
     60        'args'     => $arguments };
     61 
    5362
    5463
     
    100109    $filename =~ s/docmets.xml$/doctxt.xml/;
    101110   
    102     if (!open (FILEIN,"<$filename")){
     111    if (!open (FILEIN, "<:utf8", $filename)) {
    103112    print STDERR "Warning: unable to open the $filename\n";
    104113    $self->{'xmltxt'} = "";
     
    111120        }
    112121    }
     122
    113123        my $xml_parser = XML::XPath->new (xml=> $xml_text);
    114124    #my $xml_tree = $xml_parser->parse ($xml_text);
     
    222232   
    223233    foreach my $md_pair (@$md_list){
    224         my $metadata_name = $md_pair->{'metadata_name'};
    225         my $metadata_value = $md_pair->{'metadata_value'};
    226         $self->{'doc_obj'}->add_utf8_metadata($self->{'section'}, $metadata_name, $metadata_value);
     234        # text read in by XML::Parser is in Perl's binary byte value
     235        # form ... need to explicitly make it UTF-8
     236
     237        my $metadata_name = decode("utf8",$md_pair->{'metadata_name'});
     238        my $metadata_value = decode("utf8",$md_pair->{'metadata_value'});
     239
     240        $self->{'doc_obj'}->add_utf8_metadata($self->{'section'},
     241                          $metadata_name, $metadata_value);
    227242    }
    228243   
     
    231246   
    232247    foreach my $section_content (@$content_list){
     248        # Don't need to decode $content as this has been readin in
     249        # through XPath which (unlike XML::Parser) correctly sets
     250        # the string to be UTF8 rather than a 'binary' string of bytes
    233251        my $content = $section_content->{'section_content'};
     252
    234253        $self->{'doc_obj'}->add_utf8_text($self->{'section'},$content);
    235254    }
Note: See TracChangeset for help on using the changeset viewer.