Changeset 9410
- Timestamp: 2005-03-14T13:35:37+13:00 (19 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/multiread.pm
r3834 r9410 51 51 # not read yet 52 52 sub set_handle { 53 my $self = shift (@_); 54 ($self->{'handle'}) = @_; 53 my $self = shift; 54 $self->{'handle'} = shift; 55 binmode( $self->{'handle'} ); 55 56 $self->{'first'} = 1; 56 57 $self->{'encoding'} = "utf8"; … … 60 61 # set_encoding should be called after set_handle 61 62 sub set_encoding { 62 my $self = shift (@_); 63 ($self->{'encoding'}) = @_; 63 my $self = shift; 64 my $enc = shift; 65 if ($enc eq 'unicode_le') { 66 $enc='unicode'; 67 $self->{'bigendian'} = 0; 68 } 69 $self->{'encoding'} = $enc; 64 70 } 65 71 … … 71 77 # undef will be returned if the eof has been reached 72 78 # the result will always be returned in utf-8 73 # if automatic detection between utf8 and unicode is desired 74 # then the encoding should be initially set to utf8 79 75 80 sub read_unicode_char { 76 81 my $self = shift (@_); … … 79 84 return undef if ($self->{'handle'} eq ""); 80 85 my $handle = $self->{'handle'}; 81 binmode ($handle);82 86 83 87 if ($self->{'encoding'} eq "utf8") { … … 90 94 while (!eof ($handle)) { 91 95 $c1 = ord (getc ($handle)); 92 if ($self->{'first'}) {93 $self->{'first'} = 0;94 95 if ($c1 == 0xfe || $c1 == 0xff) {96 $c2 = ord (getc ($handle)) if (!eof ($handle));97 98 # if unicode fall through to the unicode reading code99 if ($c1 == 0xff && $c2 == 0xfe) {100 $self->{'encoding'} = "unicode";101 $self->{'bigendian'} = 0;102 last;103 104 } elsif ($c1 == 0xfe && $c2 == 0xff) {105 $self->{'encoding'} = "unicode";106 $self->{'bigendian'} = 1;107 last;108 }109 110 # an error, but we might be able to recover111 # from it112 $c1 = $c2;113 }114 }115 96 116 97 if ($c1 <= 0x7f) { … … 161 142 162 143 my $handle = $self->{'handle'}; 163 164 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {165 # special case for the first line of utf8 text to detect whether166 # the file is in utf8 or unicode167 my $out = "";168 my $thisc = "";169 while (defined ($thisc = $self->read_unicode_char())) {170 $out .= $thisc;171 last if 
($thisc eq "\n");172 }173 174 return $out if (length ($out) > 0);175 return undef;176 }177 144 178 145 if ($self->{'encoding'} eq "utf8") { … … 217 184 218 185 186 187 # this will look for a Byte Order Marker at the start of the file, and 188 # set the encoding appropriately if there is one, returning any 189 # non-marker text on the first line (or returns undef). 190 sub find_unicode_bom { 191 my $self=shift; 192 193 my $non_bom_text=""; # to return if we read in 'real' text 194 195 if ($self->{'first'} == 0) { return } 196 197 # make sure we have a file handle 198 return if ($self->{'handle'} eq ""); 199 my $handle = $self->{'handle'}; 200 201 $self->{'first'} = 0; 202 203 my $b1 = ord(getc ($handle)); 204 my $b2; 205 my $b3; 206 207 if ($b1 == 0xfe || $b1 == 0xff) { 208 $b2 = ord (getc ($handle)) if (!eof ($handle)); 209 if ($b1 == 0xff && $b2 == 0xfe) { 210 $self->{'encoding'} = "unicode"; 211 $self->{'bigendian'} = 0; 212 return; 213 } elsif ($b1 == 0xfe && $b2 == 0xff) { 214 $self->{'encoding'} = "unicode"; 215 $self->{'bigendian'} = 1; 216 return; 217 } elsif ($b1 == 0xef && $b2 == 0xbb) { 218 $b3 = ord(getc($handle)); 219 if ($b3 == 0xbf) { 220 $self->{'encoding'} = "utf8"; 221 $self->{'bigendian'} = 1; 222 return; 223 } 224 } 225 } else { # $b1 != fe or ff 226 $handle->ungetc($b1); return; 227 } 228 # if here, we have removed some chars and they aren't a BOM 229 if ($self->{'encoding'} eq "unicode") { # return the 2byte char 230 if (defined ($b3)) { # we looked at this... return it 231 $handle->ungetc($b3); 232 } 233 return &unicode::unicode2utf8([$self->{'bigendian'}? 234 ($b1*256+$b2) : ($b2*256+$b1)]); 235 } 236 # if here, it's utf-8 237 if ($b2 < 0x80) { 238 if (defined ($b3)) { # we grabbed this, but don't need it now 239 $handle->ungetc($b3); 240 } 241 return ($b1 . $b2); 242 } 243 # if here, we have taken part of a multi-byte char. 
we need to make 244 # sure we return the entire character 245 if (defined($b3) && $b3 < 0x80) { # we have all we need 246 $handle->ungetc($b3); 247 return ($b1 . $b2); 248 } 249 my $c=$b1.$b2.$b3; 250 my $b4=$handle->getc(); 251 while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars 252 $c .= $b4; 253 $b4=$handle->getc(); 254 if (eof($handle)) { last } 255 } 256 if (! eof($handle)) { 257 $handle->ungetc($b4); # this byte is an ascii byte 258 } 259 return $c; 260 } 261 262 263 219 264 # will convert entire contents of file to utf8 and append result to $outputref 220 265 # this may be a slightly faster way to get the contents of a file than by … … 229 274 my $handle = $self->{'handle'}; 230 275 231 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {232 # special case for the first line of utf8 text to detect whether233 # the file is in utf8 or unicode 234 235 # possible to have no text here...236 my $read_text = $self-> read_line();276 # if encoding is set to utf8 or unicode, sniff to see if there is a 277 # byte order marker 278 if ($self->{'first'} && 279 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) { 280 # this will change $self's encoding if there is a BOM 281 my $read_text = $self->find_unicode_bom(); 237 282 $$outputref .= $read_text if (defined($read_text)); 238 283 }
Note:
See TracChangeset
for help on using the changeset viewer.