Changeset 16719

Show
Ignore:
Timestamp:
12.08.2008 14:04:13 (11 years ago)
Author:
ak19
Message:

Dr Bainbridge has changed several methods so that they can now be called to work on strings rather than having to always read strings in from a file

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/multiread.pm

    r16375 r16719  
    129129 
    130130    return undef; 
     131} 
     132 
     133 
     134sub unicodechar_to_ord 
     135{ 
     136    my $self = shift (@_); 
     137    my ($unicode_text) = @_; 
     138 
     139    my $bigendian_ord_array = []; 
     140 
     141    my @unicodechar_array = ($unicode_text =~ m/(..)/g); 
     142 
     143    foreach my $pair (@unicodechar_array) { 
     144    # for each 2 byte pair        
     145    my $c1=ord(substr($pair,0,1)); 
     146    my $c2=ord(substr($pair,1,1)); 
     147 
     148    my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1; 
     149    push(@$bigendian_ord_array,$be_ord); 
     150    } 
     151 
     152    return $bigendian_ord_array; 
    131153} 
    132154 
     
    221243        return; 
    222244        } 
     245        else { 
     246        # put back all three bytes 
     247        $handle->ungetc($b3); 
     248        $handle->ungetc($b2); 
     249        $handle->ungetc($b1); return; 
     250 
     251        } 
     252    } 
     253    else { 
     254        # put back two bytes read 
     255        $handle->ungetc($b2); 
     256        $handle->ungetc($b1); return; 
    223257    } 
    224258    } else { # $b1 != fe or ff 
     259    # put back the one byte read 
    225260    $handle->ungetc($b1); return; 
    226261    } 
    227     # if here, we have removed some chars and they aren't a BOM 
    228     if ($self->{'encoding'} eq "unicode") { # return the 2byte char 
    229     if (defined ($b3)) { # we looked at this... return it 
    230         $handle->ungetc($b3); 
    231     } 
    232     return &unicode::unicode2utf8([$self->{'bigendian'}? 
    233                        ($b1*256+$b2) : ($b2*256+$b1)]); 
    234     } 
    235     # if here, it's utf-8 
    236     if ($b2 < 0x80) { 
    237     if (defined ($b3)) { # we grabbed this, but don't need it now 
    238         $handle->ungetc($b3); 
    239     } 
    240     return ($b1 . $b2); 
    241     } 
    242     # if here, we have taken part of a multi-byte char. we need to make 
    243     # sure we return the entire character 
    244     if (defined($b3) && $b3 < 0x80) { # we have all we need 
    245     $handle->ungetc($b3); 
    246     return ($b1 . $b2); 
    247     } 
    248     my $c=$b1.$b2.$b3; 
    249     my $b4=$handle->getc(); 
    250     while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars 
    251     $c .= $b4; 
    252     $b4=$handle->getc(); 
    253     if (eof($handle)) { last } 
    254     } 
    255     if (! eof($handle)) { 
    256     $handle->ungetc($b4); # this byte is an ascii byte 
    257     } 
    258     return $c; 
     262} 
     263 
     264 
     265sub read_file_no_decoding 
     266{ 
     267    my $self = shift (@_); 
     268    my ($outputref) = @_; 
     269 
     270    # make sure we have a file handle 
     271    return if ($self->{'handle'} eq ""); 
     272 
     273    my $handle = $self->{'handle'}; 
     274 
     275    # if encoding is set to utf8 or unicode, sniff to see if there is a  
     276    # byte order marker 
     277    if ($self->{'first'} && 
     278    ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) { 
     279 
     280    # this will change $self's encoding if there is a BOM (but won't consume any characters) 
     281    $self->find_unicode_bom();  
     282    } 
     283 
     284    undef $/; 
     285    $$outputref .=  <$handle>; 
     286    $/ = "\n"; 
     287} 
     288 
     289 
     290# will convert entire contents of file to utf8 and append result to $outputref 
     291# this may be a slightly faster way to get the contents of a file than by  
     292# recursively calling read_line() 
     293sub decode_text { 
     294    my $self = shift (@_); 
     295 
     296    my ($raw_text,$decoded_text_ref) = @_; 
     297 
     298    if ($self->{'encoding'} eq "utf8") { 
     299    # Nothing to do, raw text is in utf 8 
     300    $$decoded_text_ref .= $raw_text; 
     301    return; 
     302    } 
     303 
     304    if ($self->{'encoding'} eq "unicode") { 
     305    my $unicode_array = $self->unicodechar_to_ord($raw_text); 
     306    $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array); 
     307    return; 
     308    } 
     309 
     310    if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") { 
     311    # we'll use ascii2utf8() for this as it's faster than going 
     312    # through convert2unicode() 
     313    $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text); 
     314    return; 
     315    } 
     316 
     317    # everything else uses unicode::convert2unicode 
     318    $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text)); 
    259319} 
    260320 
     
    268328    my ($outputref) = @_; 
    269329 
    270     # make sure we have a file handle 
    271     return if ($self->{'handle'} eq ""); 
    272  
    273     my $handle = $self->{'handle'}; 
    274  
    275     # if encoding is set to utf8 or unicode, sniff to see if there is a  
    276     # byte order marker 
    277     if ($self->{'first'} && 
    278     ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) { 
    279     # this will change $self's encoding if there is a BOM 
    280     my $read_text = $self->find_unicode_bom(); 
    281     $$outputref .= $read_text if (defined($read_text)); 
    282     } 
    283  
    284     if ($self->{'encoding'} eq "utf8") { 
    285     undef $/; 
    286     $$outputref .=  <$handle>; 
    287     $/ = "\n"; 
    288     return; 
    289     } 
    290  
    291     if ($self->{'encoding'} eq "unicode") { 
    292     my $line = ""; 
    293     while (defined ($line = $self->read_line())) { 
    294         $$outputref .= $line; 
    295     } 
    296     return; 
    297     } 
    298  
    299     if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") { 
    300     # we'll use ascii2utf8() for this as it's faster than going 
    301     # through convert2unicode() 
    302     undef $/; 
    303     my $text = <$handle>; 
    304     $/ = "\n"; 
    305     $$outputref .= &unicode::ascii2utf8 (\$text); 
    306     return; 
    307     } 
    308  
    309     # everything else uses unicode::convert2unicode 
    310     undef $/; 
    311     my $text = <$handle>; 
    312     $/ = "\n"; 
    313     $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text)); 
     330    # While unusual, $raw_text is initialized to $$outputref 
     331    # to be consistent with code before refactoring 
     332    my $raw_text = $$outputref;  
     333 
     334    $self->read_file_no_decoding(\$raw_text); 
     335    $self->decode_text($raw_text,$outputref); 
    314336} 
    315337