Changeset 16719
- Timestamp: 2008-08-12T14:04:13+12:00 (16 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/multiread.pm
r16375 r16719 129 129 130 130 return undef; 131 } 132 133 134 sub unicodechar_to_ord 135 { 136 my $self = shift (@_); 137 my ($unicode_text) = @_; 138 139 my $bigendian_ord_array = []; 140 141 my @unicodechar_array = ($unicode_text =~ m/(..)/g); 142 143 foreach my $pair (@unicodechar_array) { 144 # for each 2 byte pair 145 my $c1=ord(substr($pair,0,1)); 146 my $c2=ord(substr($pair,1,1)); 147 148 my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1; 149 push(@$bigendian_ord_array,$be_ord); 150 } 151 152 return $bigendian_ord_array; 131 153 } 132 154 … … 221 243 return; 222 244 } 245 else { 246 # put back all three bytes 247 $handle->ungetc($b3); 248 $handle->ungetc($b2); 249 $handle->ungetc($b1); return; 250 251 } 252 } 253 else { 254 # put back two bytes read 255 $handle->ungetc($b2); 256 $handle->ungetc($b1); return; 223 257 } 224 258 } else { # $b1 != fe or ff 259 # put back the one byte read 225 260 $handle->ungetc($b1); return; 226 261 } 227 # if here, we have removed some chars and they aren't a BOM 228 if ($self->{'encoding'} eq "unicode") { # return the 2byte char 229 if (defined ($b3)) { # we looked at this... return it 230 $handle->ungetc($b3); 231 } 232 return &unicode::unicode2utf8([$self->{'bigendian'}? 233 ($b1*256+$b2) : ($b2*256+$b1)]); 234 } 235 # if here, it's utf-8 236 if ($b2 < 0x80) { 237 if (defined ($b3)) { # we grabbed this, but don't need it now 238 $handle->ungetc($b3); 239 } 240 return ($b1 . $b2); 241 } 242 # if here, we have taken part of a multi-byte char. we need to make 243 # sure we return the entire character 244 if (defined($b3) && $b3 < 0x80) { # we have all we need 245 $handle->ungetc($b3); 246 return ($b1 . $b2); 247 } 248 my $c=$b1.$b2.$b3; 249 my $b4=$handle->getc(); 250 while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars 251 $c .= $b4; 252 $b4=$handle->getc(); 253 if (eof($handle)) { last } 254 } 255 if (! 
eof($handle)) { 256 $handle->ungetc($b4); # this byte is an ascii byte 257 } 258 return $c; 262 } 263 264 265 sub read_file_no_decoding 266 { 267 my $self = shift (@_); 268 my ($outputref) = @_; 269 270 # make sure we have a file handle 271 return if ($self->{'handle'} eq ""); 272 273 my $handle = $self->{'handle'}; 274 275 # if encoding is set to utf8 or unicode, sniff to see if there is a 276 # byte order marker 277 if ($self->{'first'} && 278 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) { 279 280 # this will change $self's encoding if there is a BOM (but won't consume any characters) 281 $self->find_unicode_bom(); 282 } 283 284 undef $/; 285 $$outputref .= <$handle>; 286 $/ = "\n"; 287 } 288 289 290 # will convert entire contents of file to utf8 and append result to $outputref 291 # this may be a slightly faster way to get the contents of a file than by 292 # recursively calling read_line() 293 sub decode_text { 294 my $self = shift (@_); 295 296 my ($raw_text,$decoded_text_ref) = @_; 297 298 if ($self->{'encoding'} eq "utf8") { 299 # Nothing to do, raw text is in utf 8 300 $$decoded_text_ref .= $raw_text; 301 return; 302 } 303 304 if ($self->{'encoding'} eq "unicode") { 305 my $unicode_array = $self->unicodechar_to_ord($raw_text); 306 $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array); 307 return; 308 } 309 310 if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") { 311 # we'll use ascii2utf8() for this as it's faster than going 312 # through convert2unicode() 313 $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text); 314 return; 315 } 316 317 # everything else uses unicode::convert2unicode 318 $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text)); 259 319 } 260 320 … … 268 328 my ($outputref) = @_; 269 329 270 # make sure we have a file handle 271 return if ($self->{'handle'} eq ""); 272 273 my $handle = $self->{'handle'}; 274 275 # if encoding is set 
to utf8 or unicode, sniff to see if there is a 276 # byte order marker 277 if ($self->{'first'} && 278 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) { 279 # this will change $self's encoding if there is a BOM 280 my $read_text = $self->find_unicode_bom(); 281 $$outputref .= $read_text if (defined($read_text)); 282 } 283 284 if ($self->{'encoding'} eq "utf8") { 285 undef $/; 286 $$outputref .= <$handle>; 287 $/ = "\n"; 288 return; 289 } 290 291 if ($self->{'encoding'} eq "unicode") { 292 my $line = ""; 293 while (defined ($line = $self->read_line())) { 294 $$outputref .= $line; 295 } 296 return; 297 } 298 299 if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") { 300 # we'll use ascii2utf8() for this as it's faster than going 301 # through convert2unicode() 302 undef $/; 303 my $text = <$handle>; 304 $/ = "\n"; 305 $$outputref .= &unicode::ascii2utf8 (\$text); 306 return; 307 } 308 309 # everything else uses unicode::convert2unicode 310 undef $/; 311 my $text = <$handle>; 312 $/ = "\n"; 313 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text)); 330 # While unusual, $raw_text is initialized to $$outputref 331 # to be consistent with code before refactoring 332 my $raw_text = $$outputref; 333 334 $self->read_file_no_decoding(\$raw_text); 335 $self->decode_text($raw_text,$outputref); 314 336 } 315 337
Note: See TracChangeset for help on using the changeset viewer.