Changeset 16719


Ignore:
Timestamp:
2008-08-12T14:04:13+12:00 (16 years ago)
Author:
ak19
Message:

Dr Bainbridge has changed several methods so that they can now be called directly on strings, rather than always having to read the text in from a file.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/multiread.pm

    r16375 r16719  
    129129
    130130    return undef;
     131}
     132
     133
     134sub unicodechar_to_ord
     135{
     136    my $self = shift (@_);
     137    my ($unicode_text) = @_;
     138
     139    my $bigendian_ord_array = [];
     140
     141    my @unicodechar_array = ($unicode_text =~ m/(..)/g);
     142
     143    foreach my $pair (@unicodechar_array) {
     144    # for each 2 byte pair       
     145    my $c1=ord(substr($pair,0,1));
     146    my $c2=ord(substr($pair,1,1));
     147
     148    my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1;
     149    push(@$bigendian_ord_array,$be_ord);
     150    }
     151
     152    return $bigendian_ord_array;
    131153}
    132154
     
    221243        return;
    222244        }
     245        else {
     246        # put back all three bytes
     247        $handle->ungetc($b3);
     248        $handle->ungetc($b2);
     249        $handle->ungetc($b1); return;
     250
     251        }
     252    }
     253    else {
     254        # put back two bytes read
     255        $handle->ungetc($b2);
     256        $handle->ungetc($b1); return;
    223257    }
    224258    } else { # $b1 != fe or ff
     259    # put back the one byte read
    225260    $handle->ungetc($b1); return;
    226261    }
    227     # if here, we have removed some chars and they aren't a BOM
    228     if ($self->{'encoding'} eq "unicode") { # return the 2byte char
    229     if (defined ($b3)) { # we looked at this... return it
    230         $handle->ungetc($b3);
    231     }
    232     return &unicode::unicode2utf8([$self->{'bigendian'}?
    233                        ($b1*256+$b2) : ($b2*256+$b1)]);
    234     }
    235     # if here, it's utf-8
    236     if ($b2 < 0x80) {
    237     if (defined ($b3)) { # we grabbed this, but don't need it now
    238         $handle->ungetc($b3);
    239     }
    240     return ($b1 . $b2);
    241     }
    242     # if here, we have taken part of a multi-byte char. we need to make
    243     # sure we return the entire character
    244     if (defined($b3) && $b3 < 0x80) { # we have all we need
    245     $handle->ungetc($b3);
    246     return ($b1 . $b2);
    247     }
    248     my $c=$b1.$b2.$b3;
    249     my $b4=$handle->getc();
    250     while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
    251     $c .= $b4;
    252     $b4=$handle->getc();
    253     if (eof($handle)) { last }
    254     }
    255     if (! eof($handle)) {
    256     $handle->ungetc($b4); # this byte is an ascii byte
    257     }
    258     return $c;
     262}
     263
     264
     265sub read_file_no_decoding
     266{
     267    my $self = shift (@_);
     268    my ($outputref) = @_;
     269
     270    # make sure we have a file handle
     271    return if ($self->{'handle'} eq "");
     272
     273    my $handle = $self->{'handle'};
     274
     275    # if encoding is set to utf8 or unicode, sniff to see if there is a
     276    # byte order marker
     277    if ($self->{'first'} &&
     278    ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
     279
     280    # this will change $self's encoding if there is a BOM (but won't consume any characters)
     281    $self->find_unicode_bom();
     282    }
     283
     284    undef $/;
     285    $$outputref .=  <$handle>;
     286    $/ = "\n";
     287}
     288
     289
     290# will convert entire contents of file to utf8 and append result to $outputref
     291# this may be a slightly faster way to get the contents of a file than by
     292# recursively calling read_line()
     293sub decode_text {
     294    my $self = shift (@_);
     295
     296    my ($raw_text,$decoded_text_ref) = @_;
     297
     298    if ($self->{'encoding'} eq "utf8") {
     299    # Nothing to do, raw text is in utf 8
     300    $$decoded_text_ref .= $raw_text;
     301    return;
     302    }
     303
     304    if ($self->{'encoding'} eq "unicode") {
     305    my $unicode_array = $self->unicodechar_to_ord($raw_text);
     306    $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
     307    return;
     308    }
     309
     310    if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
     311    # we'll use ascii2utf8() for this as it's faster than going
     312    # through convert2unicode()
     313    $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
     314    return;
     315    }
     316
     317    # everything else uses unicode::convert2unicode
     318    $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text));
    259319}
    260320
     
    268328    my ($outputref) = @_;
    269329
    270     # make sure we have a file handle
    271     return if ($self->{'handle'} eq "");
    272 
    273     my $handle = $self->{'handle'};
    274 
    275     # if encoding is set to utf8 or unicode, sniff to see if there is a
    276     # byte order marker
    277     if ($self->{'first'} &&
    278     ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
    279     # this will change $self's encoding if there is a BOM
    280     my $read_text = $self->find_unicode_bom();
    281     $$outputref .= $read_text if (defined($read_text));
    282     }
    283 
    284     if ($self->{'encoding'} eq "utf8") {
    285     undef $/;
    286     $$outputref .=  <$handle>;
    287     $/ = "\n";
    288     return;
    289     }
    290 
    291     if ($self->{'encoding'} eq "unicode") {
    292     my $line = "";
    293     while (defined ($line = $self->read_line())) {
    294         $$outputref .= $line;
    295     }
    296     return;
    297     }
    298 
    299     if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
    300     # we'll use ascii2utf8() for this as it's faster than going
    301     # through convert2unicode()
    302     undef $/;
    303     my $text = <$handle>;
    304     $/ = "\n";
    305     $$outputref .= &unicode::ascii2utf8 (\$text);
    306     return;
    307     }
    308 
    309     # everything else uses unicode::convert2unicode
    310     undef $/;
    311     my $text = <$handle>;
    312     $/ = "\n";
    313     $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
     330    # While unusual, $raw_text is initialized to $$outputref
     331    # to be consistent with code before refactoring
     332    my $raw_text = $$outputref;
     333
     334    $self->read_file_no_decoding(\$raw_text);
     335    $self->decode_text($raw_text,$outputref);
    314336}
    315337
Note: See TracChangeset for help on using the changeset viewer.