Context Navigation

source: gsdl/trunk/perllib/multiread.pm@ 16013

Last change on this file since 16013 was 15894, checked in by mdewsnip, 16 years ago
Added "use strict" to the files missing it.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Rev	Line
[627]	1	###########################################################################
	2	#
	3	# multiread.pm --
	4	#
	5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
[9411]	6	# Copyright (C) 2005 New Zealand Digital Library project
[627]	7	#
	8	# This program is free software; you can redistribute it and/or modify
	9	# it under the terms of the GNU General Public License as published by
	10	# the Free Software Foundation; either version 2 of the License, or
	11	# (at your option) any later version.
	12	#
	13	# This program is distributed in the hope that it will be useful,
	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	# GNU General Public License for more details.
	17	#
	18	# You should have received a copy of the GNU General Public License
	19	# along with this program; if not, write to the Free Software
	20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	21	#
	22	###########################################################################
	23
	24	# the multiread object will read in a number of encodings,
	25	# the results are always returned in the utf-8 format
	26
	27	# encodings currently supported are
	28	#
[1844]	29	# utf8 - either utf8 or unicode (automatically detected)
[9411]	30	# unicode - 2-byte UCS (does endian detection)
[1870]	31	#
	32	# plus all encodings in the "encodings" package
[627]	33
	34	package multiread;
	35
[3834]	36	eval {require bytes};
[3767]	37
[15894]	38	use strict;
[627]	39	use unicode;
	40
	41	sub new {
	42	my ($class) = @_;
	43
	44	my $self = {'handle' => "",
	45	'first' => 1,
	46	'encoding' => "utf8",
	47	'bigendian' => 1};
	48
	49	return bless $self, $class;
	50	}
	51
	52	# set_handle expects the file to be already open but
	53	# not read yet
	54	sub set_handle {
[9410]	55	my $self = shift;
	56	$self->{'handle'} = shift;
	57	binmode( $self->{'handle'} );
[627]	58	$self->{'first'} = 1;
	59	$self->{'encoding'} = "utf8";
	60	$self->{'bigendian'} = 1;
	61	}
	62
	63	# set_encoding should be called after set_handle
	64	sub set_encoding {
[9410]	65	my $self = shift;
[9414]	66	$self->{'encoding'} = shift;
[627]	67	}
	68
	69	sub get_encoding {
	70	my $self = shift (@_);
	71	return $self->{'encoding'};
	72	}
	73
	74	# undef will be returned if the eof has been reached
	75	# the result will always be returned in utf-8
[9410]	76
[1868]	77	sub read_unicode_char {
[627]	78	my $self = shift (@_);
	79
	80	# make sure we have a file handle
	81	return undef if ($self->{'handle'} eq "");
	82	my $handle = $self->{'handle'};
	83
	84	if ($self->{'encoding'} eq "utf8") {
	85	# utf-8 text, how many characters we get depends
	86	# on what we find
	87	my $c1 = "";
	88	my $c2 = "";
	89	my $c3 = "";
	90
	91	while (!eof ($handle)) {
	92	$c1 = ord (getc ($handle));
	93
	94	if ($c1 <= 0x7f) {
	95	# one byte character
	96	return chr ($c1);
	97
	98	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
	99	# two byte character
	100	$c2 = getc ($handle) if (!eof ($handle));
	101	return chr ($c1) . $c2;
	102
	103	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
	104	# three byte character
	105	$c2 = getc ($handle) if (!eof ($handle));
	106	$c3 = getc ($handle) if (!eof ($handle));
	107	return chr ($c1) . $c2 . $c3;
	108	}
	109
	110	# if we get here there was an error in the file, we should
	111	# be able to recover from it however, maybe the file is in
	112	# another encoding
	113	}
	114
	115	return undef if (eof ($handle));
	116	}
	117
	118	if ($self->{'encoding'} eq "unicode") {
	119	# unicode text, get the next two characters
	120	return undef if (eof ($handle));
	121	my $c1 = ord (getc ($handle));
	122	return undef if (eof ($handle));
	123	my $c2 = ord (getc ($handle));
	124
	125	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	126	}
	127
	128	return undef;
	129	}
	130
	131
	132	# undef will be returned if the eof has been reached
	133	# the result will always be returned in utf-8
	134	sub read_line {
	135	my $self = shift (@_);
	136
	137	# make sure we have a file handle
	138	return undef if ($self->{'handle'} eq "");
	139
	140	my $handle = $self->{'handle'};
	141
	142	if ($self->{'encoding'} eq "utf8") {
	143	# utf-8 line
	144	return <$handle>;
	145	}
	146
	147	if ($self->{'encoding'} eq "unicode") {
	148	# unicode line
	149	my $c = "";
	150	my ($c1, $c2) = ("", "");
	151	my $out = "";
	152	while (read ($handle, $c, 2) == 2) {
	153	$c1 = ord (substr ($c, 0, 1));
	154	$c2 = ord (substr ($c, 1, 1));
	155	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	156	$out .= $c;
	157	last if ($c eq "\n");
	158	}
	159
	160	return $out if (length ($out) > 0);
	161	return undef;
	162	}
	163
[1227]	164	if ($self->{'encoding'} eq "iso_8859_1") {
[1870]	165	# we'll use ascii2utf8() for this as it's faster than going
	166	# through convert2unicode()
[627]	167	my $line = "";
	168	if (defined ($line = <$handle>)) {
[1870]	169	return &unicode::ascii2utf8 (\$line);
[627]	170	}
	171	}
[1844]	172
[1870]	173	# everything else uses unicode::convert2unicode
	174	my $line = "";
	175	if (defined ($line = <$handle>)) {
	176	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
[1227]	177	}
[1844]	178
[627]	179	return undef;
	180	}
	181
	182
[9410]	183
	184	# this will look for a Byte Order Marker at the start of the file, and
	185	# set the encoding appropriately if there is one, returning any
	186	# non-marker text on the first line (or returns undef).
	187	sub find_unicode_bom {
	188	my $self=shift;
	189
	190	my $non_bom_text=""; # to return if we read in 'real' text
	191
	192	if ($self->{'first'} == 0) { return }
	193
	194	# make sure we have a file handle
	195	return if ($self->{'handle'} eq "");
	196	my $handle = $self->{'handle'};
	197
	198	$self->{'first'} = 0;
	199
	200	my $b1 = ord(getc ($handle));
	201	my $b2;
	202	my $b3;
	203
	204	if ($b1 == 0xfe \|\| $b1 == 0xff) {
	205	$b2 = ord (getc ($handle)) if (!eof ($handle));
	206	if ($b1 == 0xff && $b2 == 0xfe) {
	207	$self->{'encoding'} = "unicode";
	208	$self->{'bigendian'} = 0;
	209	return;
	210	} elsif ($b1 == 0xfe && $b2 == 0xff) {
	211	$self->{'encoding'} = "unicode";
	212	$self->{'bigendian'} = 1;
	213	return;
	214	} elsif ($b1 == 0xef && $b2 == 0xbb) {
	215	$b3 = ord(getc($handle));
	216	if ($b3 == 0xbf) {
	217	$self->{'encoding'} = "utf8";
	218	$self->{'bigendian'} = 1;
	219	return;
	220	}
	221	}
	222	} else { # $b1 != fe or ff
	223	$handle->ungetc($b1); return;
	224	}
	225	# if here, we have removed some chars and they aren't a BOM
	226	if ($self->{'encoding'} eq "unicode") { # return the 2byte char
	227	if (defined ($b3)) { # we looked at this... return it
	228	$handle->ungetc($b3);
	229	}
	230	return &unicode::unicode2utf8([$self->{'bigendian'}?
	231	($b1256+$b2) : ($b2256+$b1)]);
	232	}
	233	# if here, it's utf-8
	234	if ($b2 < 0x80) {
	235	if (defined ($b3)) { # we grabbed this, but don't need it now
	236	$handle->ungetc($b3);
	237	}
	238	return ($b1 . $b2);
	239	}
	240	# if here, we have taken part of a multi-byte char. we need to make
	241	# sure we return the entire character
	242	if (defined($b3) && $b3 < 0x80) { # we have all we need
	243	$handle->ungetc($b3);
	244	return ($b1 . $b2);
	245	}
	246	my $c=$b1.$b2.$b3;
	247	my $b4=$handle->getc();
	248	while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
	249	$c .= $b4;
	250	$b4=$handle->getc();
	251	if (eof($handle)) { last }
	252	}
	253	if (! eof($handle)) {
	254	$handle->ungetc($b4); # this byte is an ascii byte
	255	}
	256	return $c;
	257	}
	258
	259
	260
# will convert entire contents of file to utf8 and append result to $outputref
# this may be a slightly faster way to get the contents of a file than by
# repeatedly calling read_line()
#
# $outputref is a reference to the string the converted text is appended
# to. Nothing is appended if no handle has been set.
sub read_file {
    my $self = shift (@_);
    my ($outputref) = @_;

    # make sure we have a file handle
    return if ($self->{'handle'} eq "");

    my $handle = $self->{'handle'};

    # if encoding is set to utf8 or unicode, sniff to see if there is a
    # byte order marker
    if ($self->{'first'} &&
        ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
        # this will change $self's encoding if there is a BOM
        my $read_text = $self->find_unicode_bom();
        $$outputref .= $read_text if (defined ($read_text));
    }

    if ($self->{'encoding'} eq "unicode") {
        # 2-byte text has to be converted piece by piece
        while (defined (my $line = $self->read_line())) {
            $$outputref .= $line;
        }
        return;
    }

    # every other encoding is slurped in one go. $/ is localised so that
    # the caller's record separator survives, even if a converter dies
    my $text = do { local $/; <$handle> };
    $text = "" unless (defined ($text)); # nothing left to read

    if ($self->{'encoding'} eq "utf8") {
        $$outputref .= $text;
        return;
    }

    if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
        # we'll use ascii2utf8() for this as it's faster than going
        # through convert2unicode()
        $$outputref .= &unicode::ascii2utf8 (\$text);
        return;
    }

    # everything else uses unicode::convert2unicode
    $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
}
	313
[627]	314	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: