########################################################################### # # multiread.pm -- # # Copyright (C) 1999 DigiLib Systems Limited, NZ # Copyright (C) 2005 New Zealand Digital Library project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # the multiread object will read in a number of encodings, # the results are always returned in the utf-8 format # encodings currently supported are # # utf8 - either utf8 or unicode (automatically detected) # unicode - 2-byte UCS (does endian detection) # # plus all encodings in the "encodings" package package multiread; eval {require bytes}; use strict; no strict 'refs'; # allow filehandles to be variables and viceversa use unicode; sub new { my ($class) = @_; my $self = {'handle' => "", 'first' => 1, 'encoding' => "utf8", 'bigendian' => 1}; return bless $self, $class; } # set_handle expects the file to be already open but # not read yet sub set_handle { my $self = shift; $self->{'handle'} = shift; binmode( $self->{'handle'} ); $self->{'first'} = 1; $self->{'encoding'} = "utf8"; $self->{'bigendian'} = 1; } # set_encoding should be called after set_handle sub set_encoding { my $self = shift; $self->{'encoding'} = shift; } sub get_encoding { my $self = shift (@_); return $self->{'encoding'}; } # undef will be returned if the eof has been reached # the result will always be returned in utf-8 sub read_unicode_char { my $self = shift (@_); # make sure we have a file handle return undef if ($self->{'handle'} eq ""); my $handle = $self->{'handle'}; if ($self->{'encoding'} eq "utf8") { # utf-8 text, how many characters we get depends # on what we find my $c1 = ""; my $c2 = ""; my $c3 = ""; while (!eof ($handle)) { $c1 = ord (getc ($handle)); if ($c1 <= 0x7f) { # one byte character return chr ($c1); } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) { # two byte character $c2 = getc ($handle) if (!eof ($handle)); return chr ($c1) . $c2; } elsif ($c1 >= 0xe0 && $c1 <= 0xef) { # three byte character $c2 = getc ($handle) if (!eof ($handle)); $c3 = getc ($handle) if (!eof ($handle)); return chr ($c1) . $c2 . $c3; } # if we get here there was an error in the file, we should # be able to recover from it however, maybe the file is in # another encoding } return undef if (eof ($handle)); } if ($self->{'encoding'} eq "unicode") { # unicode text, get the next two characters return undef if (eof ($handle)); my $c1 = ord (getc ($handle)); return undef if (eof ($handle)); my $c2 = ord (getc ($handle)); return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]); } return undef; } sub unicodechar_to_ord { my $self = shift (@_); my ($unicode_text) = @_; my $bigendian_ord_array = []; my @unicodechar_array = ($unicode_text =~ m/(..)/g); foreach my $pair (@unicodechar_array) { # for each 2 byte pair my $c1=ord(substr($pair,0,1)); my $c2=ord(substr($pair,1,1)); my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1; push(@$bigendian_ord_array,$be_ord); } return $bigendian_ord_array; } # undef will be returned if the eof has been reached # the result will always be returned in utf-8 sub read_line { my $self = shift (@_); # make sure we have a file handle return undef if ($self->{'handle'} eq ""); my $handle = $self->{'handle'}; if ($self->{'encoding'} eq "utf8") { # utf-8 line return <$handle>; } if ($self->{'encoding'} eq "unicode") { # unicode line my $c = ""; my ($c1, $c2) = ("", ""); my $out = ""; while (read ($handle, $c, 2) == 2) { $c1 = ord (substr ($c, 0, 1)); $c2 = ord (substr ($c, 1, 1)); $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]); $out .= $c; last if ($c eq "\n"); } return $out if (length ($out) > 0); return undef; } if ($self->{'encoding'} eq "iso_8859_1") { # we'll use ascii2utf8() for this as it's faster than going # through convert2unicode() my $line = ""; if (defined ($line = <$handle>)) { return &unicode::ascii2utf8 (\$line); } } # everything else uses unicode::convert2unicode my $line = ""; if (defined ($line = <$handle>)) { return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line)); } return undef; } # this will look for a Byte Order Marker at the start of the file, and # set the encoding appropriately if there is one, returning any # non-marker text on the first line (or returns undef). sub find_unicode_bom { my $self=shift; my $non_bom_text=""; # to return if we read in 'real' text if ($self->{'first'} == 0) { return } # make sure we have a file handle return if ($self->{'handle'} eq ""); my $handle = $self->{'handle'}; $self->{'first'} = 0; my $gc = getc ($handle); my $b1 = ord($gc); my $b2; my $b3; if ($b1 == 0xfe || $b1 == 0xff) { $b2 = ord (getc ($handle)) if (!eof ($handle)); if ($b1 == 0xff && $b2 == 0xfe) { $self->{'encoding'} = "unicode"; $self->{'bigendian'} = 0; return; } elsif ($b1 == 0xfe && $b2 == 0xff) { $self->{'encoding'} = "unicode"; $self->{'bigendian'} = 1; return; } elsif ($b1 == 0xef && $b2 == 0xbb) { $b3 = ord(getc($handle)); if ($b3 == 0xbf) { $self->{'encoding'} = "utf8"; $self->{'bigendian'} = 1; return; } else { # put back all three bytes $handle->ungetc($b3); $handle->ungetc($b2); $handle->ungetc($b1); return; } } else { # put back two bytes read $handle->ungetc($b2); $handle->ungetc($b1); return; } } else { # $b1 != fe or ff # put back the one byte read $handle->ungetc($b1); return; } } sub read_file_no_decoding { my $self = shift (@_); my ($outputref) = @_; # make sure we have a file handle return if ($self->{'handle'} eq ""); my $handle = $self->{'handle'}; # if encoding is set to utf8 or unicode, sniff to see if there is a # byte order marker if ($self->{'first'} && ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) { # this will change $self's encoding if there is a BOM (but won't consume any characters) $self->find_unicode_bom(); } undef $/; $$outputref .= <$handle>; $/ = "\n"; } # will convert entire contents of file to utf8 and append result to $outputref # this may be a slightly faster way to get the contents of a file than by # recursively calling read_line() sub decode_text { my $self = shift (@_); my ($raw_text,$decoded_text_ref) = @_; if ($self->{'encoding'} eq "utf8") { # Nothing to do, raw text is in utf 8 $$decoded_text_ref .= $raw_text; return; } if ($self->{'encoding'} eq "unicode") { my $unicode_array = $self->unicodechar_to_ord($raw_text); $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array); return; } if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") { # we'll use ascii2utf8() for this as it's faster than going # through convert2unicode() $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text); return; } # everything else uses unicode::convert2unicode $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text)); } # will convert entire contents of file to utf8 and append result to $outputref # this may be a slightly faster way to get the contents of a file than by # recursively calling read_line() sub read_file { my $self = shift (@_); my ($outputref) = @_; # While unusual, $raw_text is initialized to $$outputref # to be consistent with code before refactoring my $raw_text = $$outputref; $self->read_file_no_decoding(\$raw_text); $self->decode_text($raw_text,$outputref); } 1;