Changeset 9410


Ignore:
Timestamp:
2005-03-14T13:35:37+13:00 (19 years ago)
Author:
jrm21
Message:

Do Unicode endianness detection if the encoding is 'unicode', rather than the
very unintuitive 'utf8'.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/multiread.pm

    r3834 r9410  
    5151# not read yet
    5252sub set_handle {
    53     my $self = shift (@_);
    54     ($self->{'handle'}) = @_;
     53    my $self = shift;
     54    $self->{'handle'} = shift;
     55    binmode( $self->{'handle'} );
    5556    $self->{'first'} = 1;
    5657    $self->{'encoding'} = "utf8";
     
    6061# set_encoding should be called after set_handle (set_handle resets the
        # encoding to "utf8"). Passing 'unicode_le' stores the encoding as
        # 'unicode' and sets $self->{'bigendian'} to 0.
    6162sub set_encoding {
    62     my $self = shift (@_);
    63     ($self->{'encoding'}) = @_;
     63    my $self = shift;
     64    my $enc = shift; # requested encoding name
     65    if ($enc eq 'unicode_le') { # alias for little-endian 'unicode'
     66    $enc='unicode';
     67    $self->{'bigendian'} = 0;
     68    }
     69    $self->{'encoding'} = $enc;
    6470}
    6571
     
    7177# undef will be returned if the eof has been reached
    7278# the result will always be returned in utf-8
    73 # if automatic detection between utf8 and unicode is desired
    74 # then the encoding should be initially set to utf8
     79
    7580sub read_unicode_char {
    7681    my $self = shift (@_);
     
    7984    return undef if ($self->{'handle'} eq "");
    8085    my $handle = $self->{'handle'};
    81     binmode ($handle);
    8286
    8387    if ($self->{'encoding'} eq "utf8") {
     
    9094    while (!eof ($handle)) {
    9195        $c1 = ord (getc ($handle));
    92         if ($self->{'first'}) {
    93         $self->{'first'} = 0;
    94 
    95         if ($c1 == 0xfe || $c1 == 0xff) {
    96             $c2 = ord (getc ($handle)) if (!eof ($handle));
    97            
    98             # if unicode fall through to the unicode reading code
    99             if ($c1 == 0xff && $c2 == 0xfe) {
    100             $self->{'encoding'} = "unicode";
    101             $self->{'bigendian'} = 0;
    102             last;
    103 
    104             } elsif ($c1 == 0xfe && $c2 == 0xff) {
    105             $self->{'encoding'} = "unicode";
    106             $self->{'bigendian'} = 1;
    107             last;
    108             }
    109 
    110             # an error, but we might be able to recover
    111             # from it
    112             $c1 = $c2;
    113         }
    114         }
    11596
    11697        if ($c1 <= 0x7f) {
     
    161142
    162143    my $handle = $self->{'handle'};
    163 
    164     if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
    165     # special case for the first line of utf8 text to detect whether
    166     # the file is in utf8 or unicode
    167     my $out = "";
    168     my $thisc = "";
    169     while (defined ($thisc = $self->read_unicode_char())) {
    170         $out .= $thisc;
    171         last if ($thisc eq "\n");
    172     }
    173    
    174     return $out if (length ($out) > 0);
    175     return undef;
    176     }
    177144
    178145    if ($self->{'encoding'} eq "utf8") {
     
    217184
    218185
     186
     187# Looks for a byte order mark (BOM) at the start of the file and, if one
     188# is found, sets $self->{'encoding'} and $self->{'bigendian'} to match.
     189# Only acts while $self->{'first'} is true, and clears that flag.
        # Returns nothing (undef in scalar context) when a BOM was consumed, when
        # the first byte was pushed back onto the handle, or when there is no
        # handle; otherwise returns text built from the bytes already read.
     190sub find_unicode_bom {
     191    my $self=shift;
     192
     193    my $non_bom_text=""; # to return if we read in 'real' text; NOTE(review): not used below
     194
     195    if ($self->{'first'} == 0) { return } # start of file already examined
     196
     197    # make sure we have a file handle
     198    return if ($self->{'handle'} eq "");
     199    my $handle = $self->{'handle'};
     200
     201    $self->{'first'} = 0;
     202
     203    my $b1 = ord(getc ($handle)); # numeric value of the first byte
     204    my $b2;
     205    my $b3; # only ever assigned inside the 0xef/0xbb branch below
     206
     207    if ($b1 == 0xfe || $b1 == 0xff) {
     208    $b2 = ord (getc ($handle)) if (!eof ($handle)); # stays undef at eof
     209    if ($b1 == 0xff && $b2 == 0xfe) { # FF FE: little-endian BOM
     210        $self->{'encoding'} = "unicode";
     211        $self->{'bigendian'} = 0;
     212        return;
     213    } elsif ($b1 == 0xfe && $b2 == 0xff) { # FE FF: big-endian BOM
     214        $self->{'encoding'} = "unicode";
     215        $self->{'bigendian'} = 1;
     216        return;
     217    } elsif ($b1 == 0xef && $b2 == 0xbb) { # NOTE(review): cannot match here, $b1 is 0xfe or 0xff in this branch
     218        $b3 = ord(getc($handle));
     219        if ($b3 == 0xbf) {
     220        $self->{'encoding'} = "utf8";
     221        $self->{'bigendian'} = 1;
     222        return;
     223        }
     224    }
     225    } else { # $b1 is neither 0xfe nor 0xff
     226    $handle->ungetc($b1); return; # push the byte back; nothing consumed
     227    }
     228    # if here, we have removed some chars and they aren't a BOM
     229    if ($self->{'encoding'} eq "unicode") { # return the 2byte char
     230    if (defined ($b3)) { # we looked at this... return it
     231        $handle->ungetc($b3);
     232    }
     233    return &unicode::unicode2utf8([$self->{'bigendian'}?
     234                       ($b1*256+$b2) : ($b2*256+$b1)]); # 16-bit value built from $b1/$b2 per 'bigendian'
     235    }
     236    # if here, it's utf-8
     237    if ($b2 < 0x80) {
     238    if (defined ($b3)) { # we grabbed this, but don't need it now
     239        $handle->ungetc($b3);
     240    }
     241    return ($b1 . $b2); # NOTE(review): $b1/$b2 are ord() values, so this joins their decimal digits, not the bytes; confirm intent
     242    }
     243    # if here, we have taken part of a multi-byte char. we need to make
     244    # sure we return the entire character
     245    if (defined($b3) && $b3 < 0x80) { # we have all we need
     246    $handle->ungetc($b3);
     247    return ($b1 . $b2); # NOTE(review): concatenates ord() values, as above
     248    }
     249    my $c=$b1.$b2.$b3; # NOTE(review): concatenates ord() values; $b3 may be undef here
        # NOTE(review): $b4 holds what getc returned, not an ord() value, yet
        # it is compared numerically with 0x7f below; confirm intent
     250    my $b4=$handle->getc();
     251    while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
     252    $c .= $b4;
     253    $b4=$handle->getc();
     254    if (eof($handle)) { last }
     255    }
     256    if (! eof($handle)) {
     257    $handle->ungetc($b4); # this byte is an ascii byte
     258    }
     259    return $c;
     260}
     261
     262
     263
    219264# will convert entire contents of file to utf8 and append result to $outputref
    220265# this may be a slightly faster way to get the contents of a file than by
     
    229274    my $handle = $self->{'handle'};
    230275
    231     if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
    232     # special case for the first line of utf8 text to detect whether
    233     # the file is in utf8 or unicode
    234    
    235     # possible to have no text here...
    236     my $read_text = $self->read_line ();
     276    # if encoding is set to utf8 or unicode, sniff to see if there is a
     277    # byte order marker
     278    if ($self->{'first'} &&
     279    ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
     280    # this will change $self's encoding if there is a BOM
     281    my $read_text = $self->find_unicode_bom();
    237282    $$outputref .= $read_text if (defined($read_text));
    238283    }
Note: See TracChangeset for help on using the changeset viewer.