Context Navigation

← Previous Changeset
Next Changeset →

Changeset 16719

Timestamp:

2008-08-12T14:04:13+12:00 (16 years ago)

Author:

ak19

Message:

Dr Bainbridge has changed several methods so that they can now be called to work on strings rather than having to always read strings in from a file

File:

: 1 edited

gsdl/trunk/perllib/multiread.pm (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/multiread.pm

-              r16375
+              r16719
     return undef;
+}
+# Split a raw string of 2-byte units into their numeric values.
+#   $unicode_text - string to split; consumed two characters at a time
+# Returns a reference to an array holding one integer per complete pair,
+# combined high-byte-first when $self->{'bigendian'} is true and
+# low-byte-first otherwise. A trailing unpaired byte is ignored, and an
+# undefined input yields an empty array.
+sub unicodechar_to_ord
+{
+    my $self = shift (@_);
+    my ($unicode_text) = @_;
+
+    my $bigendian_ord_array = [];
+    return $bigendian_ord_array unless defined $unicode_text;
+
+    # /s is required: without it '.' refuses to match "\n" (0x0A), so any
+    # pair containing that byte is skipped and every later pair is read
+    # out of alignment
+    my @unicodechar_array = ($unicode_text =~ m/(..)/gs);
+
+    foreach my $pair (@unicodechar_array) {
+        # for each 2 byte pair
+        my $c1 = ord(substr($pair,0,1));
+        my $c2 = ord(substr($pair,1,1));
+
+        my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1;
+        push(@$bigendian_ord_array,$be_ord);
+    }
+
+    return $bigendian_ord_array;
+}
 …
         return;
+        }
+        else {
+        # put back all three bytes
+        $handle->ungetc($b3);
+        $handle->ungetc($b2);
+        $handle->ungetc($b1); return;
+        }
+    }
+    else {
+        # put back two bytes read
+        $handle->ungetc($b2);
+        $handle->ungetc($b1); return;
+    }
     } else { # $b1 != fe or ff
+    # put back the one byte read
     $handle->ungetc($b1); return;
+    }
+    # if here, we have removed some chars and they aren't a BOM
+    if ($self->{'encoding'} eq "unicode") { # return the 2byte char
+    if (defined ($b3)) { # we looked at this... return it
+        $handle->ungetc($b3);
+    }
+    return &unicode::unicode2utf8([$self->{'bigendian'}?
+                       ($b1*256+$b2) : ($b2*256+$b1)]);
+    }
+    # if here, it's utf-8
+    if ($b2 < 0x80) {
+    if (defined ($b3)) { # we grabbed this, but don't need it now
+        $handle->ungetc($b3);
+    }
+    return ($b1 . $b2);
+    }
+    # if here, we have taken part of a multi-byte char. we need to make
+    # sure we return the entire character
+    if (defined($b3) && $b3 < 0x80) { # we have all we need
+    $handle->ungetc($b3);
+    return ($b1 . $b2);
+    }
+    my $c=$b1.$b2.$b3;
+    my $b4=$handle->getc();
+    while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
+    $c .= $b4;
+    $b4=$handle->getc();
+    if (eof($handle)) { last }
+    }
+    if (! eof($handle)) {
+    $handle->ungetc($b4); # this byte is an ascii byte
+    }
+    return $c;
+}
+# Append the remaining raw contents of $self->{'handle'} to a string,
+# without converting the encoding.
+#   $outputref - reference to the scalar the raw text is appended to
+# Returns nothing. Does nothing when no file handle has been set.
+sub read_file_no_decoding
+{
+    my $self = shift (@_);
+    my ($outputref) = @_;
+
+    # make sure we have a file handle
+    return if (!defined($self->{'handle'}) || $self->{'handle'} eq "");
+
+    my $handle = $self->{'handle'};
+
+    # if encoding is set to utf8 or unicode, sniff to see if there is a
+    # byte order marker
+    if ($self->{'first'} &&
+        ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
+        # this will change $self's encoding if there is a BOM (but won't consume any characters)
+        $self->find_unicode_bom();
+    }
+
+    # slurp the rest of the file in one read. 'local' puts back whatever
+    # record separator the caller had in effect (even if the read dies)
+    # instead of forcing it to "\n" afterwards
+    local $/ = undef;
+    $$outputref .= <$handle>;
+}
+# will convert entire contents of file to utf8 and append result to $outputref
+# this may be a slightly faster way to get the contents of a file than by
+# repeatedly calling read_line()
+# decode_text: convert an in-memory string from $self->{'encoding'} to
+# utf8 and append the converted text to a caller-supplied scalar.
+#   $raw_text         - the text to convert
+#   $decoded_text_ref - reference to the scalar that receives the result
+sub decode_text {
+    my $self = shift (@_);
+    my ($raw_text,$decoded_text_ref) = @_;
+
+    my $encoding = $self->{'encoding'};
+
+    if ($encoding eq "utf8") {
+        # already utf8, so the text is passed through unchanged
+        $$decoded_text_ref .= $raw_text;
+    }
+    elsif ($encoding eq "unicode") {
+        # 2-byte units: turn them into ordinals, then into utf8
+        my $ord_array = $self->unicodechar_to_ord($raw_text);
+        $$decoded_text_ref .= &unicode::unicode2utf8($ord_array);
+    }
+    elsif ($encoding eq "iso_8859_1" || $encoding eq "ascii") {
+        # ascii2utf8() is the faster route for these two; it avoids the
+        # trip through convert2unicode()
+        $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
+    }
+    else {
+        # every other encoding goes via unicode::convert2unicode
+        $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$raw_text));
+    }
+
+    return;
+}
 …
     my ($outputref) = @_;
+    # make sure we have a file handle
+    return if ($self->{'handle'} eq "");
+    my $handle = $self->{'handle'};
+    # if encoding is set to utf8 or unicode, sniff to see if there is a
+    # byte order marker
+    if ($self->{'first'} &&
+    ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
+    # this will change $self's encoding if there is a BOM
+    my $read_text = $self->find_unicode_bom();
+    $$outputref .= $read_text if (defined($read_text));
+    }
+    if ($self->{'encoding'} eq "utf8") {
+    undef $/;
+    $$outputref .=  <$handle>;
+    $/ = "\n";
+    return;
+    }
+    if ($self->{'encoding'} eq "unicode") {
+    my $line = "";
+    while (defined ($line = $self->read_line())) {
+        $$outputref .= $line;
+    }
+    return;
+    }
+    if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
+    # we'll use ascii2utf8() for this as it's faster than going
+    # through convert2unicode()
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::ascii2utf8 (\$text);
+    return;
+    }
+    # everything else uses unicode::convert2unicode
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
+    # While unusual, $raw_text is initialized to $$outputref
+    # to be consistent with code before refactoring
+    my $raw_text = $$outputref;
+    $self->read_file_no_decoding(\$raw_text);
+    $self->decode_text($raw_text,$outputref);
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 16719

Legend:

gsdl/trunk/perllib/multiread.pm

Download in other formats: