Changeset 27106

Show
Ignore:
Timestamp:
21.03.2013 12:39:49 (6 years ago)
Author:
kjdon
Message:

Need to do the same UTF-8 decode step that is used in ReadTextFile on the text and metadata, so that we get proper UTF-8 strings.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/MARCPlugin.pm

    r24547 r27106  
    2929use SplitTextFile; 
    3030use MetadataRead; 
     31 
     32use Encode; 
    3133 
    3234use unicode; 
     
    179181    if ($encoding eq "utf8") { 
    180182    # nothing needs to be done 
    181     return $line; 
    182     } 
    183  
    184     if ($encoding eq "iso_8859_1") { 
     183    #return $line; 
     184    } elsif ($encoding eq "iso_8859_1") { 
    185185    # we'll use ascii2utf8() for this as it's faster than going 
    186186    # through convert2unicode() 
    187     return &unicode::ascii2utf8 (\$line); 
    188     } 
     187    #return &unicode::ascii2utf8 (\$line); 
     188    $line = &unicode::ascii2utf8 (\$line); 
     189    } else { 
    189190 
    190191    # everything else uses unicode::convert2unicode 
    191     return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line)); 
     192    $line = &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line)); 
     193    } 
     194    # At this point $line is a binary byte string 
     195    # => turn it into a Unicode aware string, so full 
     196    # Unicode aware pattern matching can be used. 
     197    # For instance: 's/\x{0101}//g' or '[[:upper:]]' 
     198 
     199    return decode ("utf8", $line); 
    192200} 
    193201 
     
    266274 
    267275    my $encoding = $self->{'readfile_encoding'}->{$filename}; 
    268  
    269276    if (defined ($self->{'metadata_mapping'}) ) { 
    270277    if ($self->{'type'} ne "ascii" ){