Context Navigation

← Previous Changeset
Next Changeset →

Changeset 9410

Timestamp:

2005-03-14T13:35:37+13:00 (19 years ago)

Author:

jrm21

Message:

do unicode endian detection if the encoding is 'unicode', rather than the
very unintuitive 'utf8'.

File:

: 1 edited

trunk/gsdl/perllib/multiread.pm (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/multiread.pm

-              r3834
+              r9410
 # not read yet
 sub set_handle {
+    my $self = shift (@_);
+    ($self->{'handle'}) = @_;
+    my $self = shift;
+    $self->{'handle'} = shift;
+    binmode( $self->{'handle'} );
     $self->{'first'} = 1;
     $self->{'encoding'} = "utf8";
 …
 # set_encoding should be called after set_handle
 sub set_encoding {
+    my $self = shift (@_);
+    ($self->{'encoding'}) = @_;
+    my $self = shift;
+    my $enc = shift;
+    if ($enc eq 'unicode_le') {
+    $enc='unicode';
+    $self->{'bigendian'} = 0;
+    }
+    $self->{'encoding'} = $enc;
+}
 …
 # undef will be returned if the eof has been reached
 # the result will always be returned in utf-8
+# if automatic detection between utf8 and unicode is desired
+# then the encoding should be initially set to utf8
 sub read_unicode_char {
     my $self = shift (@_);
 …
     return undef if ($self->{'handle'} eq "");
     my $handle = $self->{'handle'};
-    binmode ($handle);
     if ($self->{'encoding'} eq "utf8") {
 …
     while (!eof ($handle)) {
         $c1 = ord (getc ($handle));
-        if ($self->{'first'}) {
-        $self->{'first'} = 0;
-        if ($c1 == 0xfe || $c1 == 0xff) {
-            $c2 = ord (getc ($handle)) if (!eof ($handle));
-            # if unicode fall through to the unicode reading code
-            if ($c1 == 0xff && $c2 == 0xfe) {
-            $self->{'encoding'} = "unicode";
-            $self->{'bigendian'} = 0;
-            last;
-            } elsif ($c1 == 0xfe && $c2 == 0xff) {
-            $self->{'encoding'} = "unicode";
-            $self->{'bigendian'} = 1;
-            last;
+            }
-            # an error, but we might be able to recover
-            # from it
-            $c1 = $c2;
+        }
+        }
         if ($c1 <= 0x7f) {
 …
     my $handle = $self->{'handle'};
-    if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
-    # special case for the first line of utf8 text to detect whether
-    # the file is in utf8 or unicode
-    my $out = "";
-    my $thisc = "";
-    while (defined ($thisc = $self->read_unicode_char())) {
-        $out .= $thisc;
-        last if ($thisc eq "\n");
+    }
-    return $out if (length ($out) > 0);
-    return undef;
+    }
     if ($self->{'encoding'} eq "utf8") {
 …
# Looks for a Byte Order Marker (BOM) at the very start of the file and,
# if one is present, consumes it and records what it says about the file:
#
#   FF FE     -> encoding 'unicode', bigendian 0
#   FE FF     -> encoding 'unicode', bigendian 1
#   EF BB BF  -> encoding 'utf8'
#
# Only the first call on a handle does anything; 'first' is cleared as
# soon as sniffing starts.
#
# Returns nothing when a BOM was found, when the first byte cannot start
# a BOM (that byte is pushed back onto the handle), or when there is
# nothing to read.  If bytes had to be consumed while testing for a BOM
# and they turned out to be ordinary text, that text is returned so the
# caller does not lose it: converted through unicode::unicode2utf8 when
# the encoding is 'unicode', or as the raw bytes when it is 'utf8'.
#
# At most one byte is ever pushed back, which is all that
# IO::Handle::ungetc guarantees.
sub find_unicode_bom {
    my $self = shift;

    # sniffing is only meaningful once, before anything has been read
    return if (!$self->{'first'});

    # make sure we have a file handle
    my $handle = $self->{'handle'};
    return if (!defined ($handle) || $handle eq "");

    $self->{'first'} = 0;

    my $ch = getc ($handle);
    return if (!defined ($ch));	# empty file: nothing to sniff, and
				# nothing that may be pushed back
    my $b1 = ord ($ch);

    # 0xff and 0xfe can only start a 16-bit BOM, 0xef the utf-8 BOM.
    # Anything else is ordinary text, so hand it straight back.
    if ($b1 != 0xff && $b1 != 0xfe && $b1 != 0xef) {
	$handle->ungetc ($b1);
	return;
    }

    my ($b2, $b3);
    $ch = getc ($handle);
    $b2 = ord ($ch) if (defined ($ch));

    if (defined ($b2)) {
	if ($b1 == 0xff && $b2 == 0xfe) {
	    $self->{'encoding'} = "unicode";
	    $self->{'bigendian'} = 0;
	    return;
	}
	if ($b1 == 0xfe && $b2 == 0xff) {
	    $self->{'encoding'} = "unicode";
	    $self->{'bigendian'} = 1;
	    return;
	}
	if ($b1 == 0xef && $b2 == 0xbb) {
	    $ch = getc ($handle);
	    $b3 = ord ($ch) if (defined ($ch));
	    if (defined ($b3) && $b3 == 0xbf) {
		$self->{'encoding'} = "utf8";
		return;
	    }
	}
    }

    # if here, we have removed some bytes and they aren't a BOM

    if ($self->{'encoding'} eq "unicode") {
	if (!defined ($b2)) {
	    # a single byte before eof cannot form a 16-bit character;
	    # put it back rather than invent a second byte for it
	    $handle->ungetc ($b1);
	    return;
	}
	# $b3 is only ever read while testing for the utf-8 BOM; in a
	# 16-bit stream it belongs to the next character
	$handle->ungetc ($b3) if (defined ($b3));
	return &unicode::unicode2utf8 ([$self->{'bigendian'} ?
					($b1*256+$b2) : ($b2*256+$b1)]);
    }

    # if here, it's utf-8.  $b1 is always >= 0x80, so we may have taken
    # part of a multi-byte character.  Keep reading until the next ascii
    # byte (which always starts a new character) and push that one back.
    # note - this will return consecutive multi-byte utf8 chars
    my @raw = grep { defined ($_) } ($b1, $b2, $b3);
    my $byte = pop (@raw);	# the most recently read byte
    my $text = join ("", map { chr ($_) } @raw);
    while (defined ($byte) && $byte >= 0x80) {
	$text .= chr ($byte);
	$ch = getc ($handle);
	$byte = defined ($ch) ? ord ($ch) : undef;
    }
    $handle->ungetc ($byte) if (defined ($byte));

    return $text;
}
 # will convert entire contents of file to utf8 and append result to $outputref
 # this may be a slightly faster way to get the contents of a file than by
 …
     my $handle = $self->{'handle'};
     if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
     # special case for the first line of utf8 text to detect whether
+    # the file is in utf8 or unicode
     # possible to have no text here...
     my $read_text = $self->read_line ();
+    # if encoding is set to utf8 or unicode, sniff to see if there is a
+    # byte order marker
+    if ($self->{'first'} &&
+    ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
+    # this will change $self's encoding if there is a BOM
+    my $read_text = $self->find_unicode_bom();
     $$outputref .= $read_text if (defined($read_text));
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 9410

Legend:

trunk/gsdl/perllib/multiread.pm

Download in other formats: