Changeset 27106 for main/trunk


Ignore:
Timestamp:
2013-03-21T12:39:49+13:00 (11 years ago)
Author:
kjdon
Message:

Need to do the same UTF-8 decode step that is used in ReadTextFile on the text and metadata, so that we get proper UTF-8 strings.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm

    r24547 r27106  
    29  29  use SplitTextFile;
    30  30  use MetadataRead;
        31
        32  use Encode;
    31  33
    32  34  use unicode;
     
    179 181     if ($encoding eq "utf8") {
    180 182     # nothing needs to be done
    181         return $line;
    182         }
    183
    184         if ($encoding eq "iso_8859_1") {
        183     #return $line;
        184     } elsif ($encoding eq "iso_8859_1") {
    185 185     # we'll use ascii2utf8() for this as it's faster than going
    186 186     # through convert2unicode()
    187         return &unicode::ascii2utf8 (\$line);
    188         }
        187     #return &unicode::ascii2utf8 (\$line);
        188     $line = &unicode::ascii2utf8 (\$line);
        189     } else {
    189 190
    190 191     # everything else uses unicode::convert2unicode
    191         return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
        192     $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
        193     }
        194     # At this point $line is a binary byte string
        195     # => turn it into a Unicode aware string, so full
        196     # Unicode aware pattern matching can be used.
        197     # For instance: 's/\x{0101}//g' or '[[:upper:]]'
        198
        199     return decode ("utf8", $line);
    192 200 }
    193 201
     
    266 274
    267 275     my $encoding = $self->{'readfile_encoding'}->{$filename};
    268
    269 276     if (defined ($self->{'metadata_mapping'}) ) {
    270 277     if ($self->{'type'} ne "ascii" ){
Note: See TracChangeset for help on using the changeset viewer.