Context Navigation

source: trunk/gsdl/perllib/multiread.pm@ 7703

Last change on this file since 7703 was 3834, checked in by sjboddie, 21 years ago
Prevent "use bytes" from causing errors for older perls
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 6.9 KB

Rev	Line
[627]	1	###########################################################################
	2	#
	3	# multiread.pm --
	4	#
	5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
	6	#
	7	# This program is free software; you can redistribute it and/or modify
	8	# it under the terms of the GNU General Public License as published by
	9	# the Free Software Foundation; either version 2 of the License, or
	10	# (at your option) any later version.
	11	#
	12	# This program is distributed in the hope that it will be useful,
	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	# GNU General Public License for more details.
	16	#
	17	# You should have received a copy of the GNU General Public License
	18	# along with this program; if not, write to the Free Software
	19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	20	#
	21	###########################################################################
	22
	23	# the multiread object will read in a number of encodings,
	24	# the results are always returned in the utf-8 format
	25
	26	# encodings currently supported are
	27	#
[1844]	28	# utf8 - either utf8 or unicode (automatically detected)
	29	# unicode - just unicode (doesn't currently do endian detection)
[1870]	30	#
	31	# plus all encodings in the "encodings" package
[627]	32
	33	package multiread;
	34
[3834]	35	eval {require bytes};
[3767]	36
[627]	37	use unicode;
	38
	39	sub new {
	40	my ($class) = @_;
	41
	42	my $self = {'handle' => "",
	43	'first' => 1,
	44	'encoding' => "utf8",
	45	'bigendian' => 1};
	46
	47	return bless $self, $class;
	48	}
	49
	50	# set_handle expects the file to be already open but
	51	# not read yet
	52	sub set_handle {
	53	my $self = shift (@_);
	54	($self->{'handle'}) = @_;
	55	$self->{'first'} = 1;
	56	$self->{'encoding'} = "utf8";
	57	$self->{'bigendian'} = 1;
	58	}
	59
	60	# set_encoding should be called after set_handle
	61	sub set_encoding {
	62	my $self = shift (@_);
	63	($self->{'encoding'}) = @_;
	64	}
	65
	66	sub get_encoding {
	67	my $self = shift (@_);
	68	return $self->{'encoding'};
	69	}
	70
	71	# undef will be returned if the eof has been reached
	72	# the result will always be returned in utf-8
	73	# if automatic detection between utf8 and unicode is desired
	74	# then the encoding should be initially set to utf8
[1868]	75	sub read_unicode_char {
[627]	76	my $self = shift (@_);
	77
	78	# make sure we have a file handle
	79	return undef if ($self->{'handle'} eq "");
	80	my $handle = $self->{'handle'};
[1868]	81	binmode ($handle);
[627]	82
	83	if ($self->{'encoding'} eq "utf8") {
	84	# utf-8 text, how many characters we get depends
	85	# on what we find
	86	my $c1 = "";
	87	my $c2 = "";
	88	my $c3 = "";
	89
	90	while (!eof ($handle)) {
	91	$c1 = ord (getc ($handle));
	92	if ($self->{'first'}) {
	93	$self->{'first'} = 0;
	94
	95	if ($c1 == 0xfe \|\| $c1 == 0xff) {
	96	$c2 = ord (getc ($handle)) if (!eof ($handle));
	97
	98	# if unicode fall through to the unicode reading code
	99	if ($c1 == 0xff && $c2 == 0xfe) {
	100	$self->{'encoding'} = "unicode";
	101	$self->{'bigendian'} = 0;
	102	last;
	103
	104	} elsif ($c1 == 0xfe && $c2 == 0xff) {
	105	$self->{'encoding'} = "unicode";
	106	$self->{'bigendian'} = 1;
	107	last;
	108	}
	109
	110	# an error, but we might be able to recover
	111	# from it
	112	$c1 = $c2;
	113	}
	114	}
	115
	116	if ($c1 <= 0x7f) {
	117	# one byte character
	118	return chr ($c1);
	119
	120	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
	121	# two byte character
	122	$c2 = getc ($handle) if (!eof ($handle));
	123	return chr ($c1) . $c2;
	124
	125	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
	126	# three byte character
	127	$c2 = getc ($handle) if (!eof ($handle));
	128	$c3 = getc ($handle) if (!eof ($handle));
	129	return chr ($c1) . $c2 . $c3;
	130	}
	131
	132	# if we get here there was an error in the file, we should
	133	# be able to recover from it however, maybe the file is in
	134	# another encoding
	135	}
	136
	137	return undef if (eof ($handle));
	138	}
	139
	140	if ($self->{'encoding'} eq "unicode") {
	141	# unicode text, get the next two characters
	142	return undef if (eof ($handle));
	143	my $c1 = ord (getc ($handle));
	144	return undef if (eof ($handle));
	145	my $c2 = ord (getc ($handle));
	146
	147	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	148	}
	149
	150	return undef;
	151	}
	152
	153
	154	# undef will be returned if the eof has been reached
	155	# the result will always be returned in utf-8
	156	sub read_line {
	157	my $self = shift (@_);
	158
	159	# make sure we have a file handle
	160	return undef if ($self->{'handle'} eq "");
	161
	162	my $handle = $self->{'handle'};
	163
	164	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
	165	# special case for the first line of utf8 text to detect whether
	166	# the file is in utf8 or unicode
	167	my $out = "";
	168	my $thisc = "";
[1868]	169	while (defined ($thisc = $self->read_unicode_char())) {
[627]	170	$out .= $thisc;
	171	last if ($thisc eq "\n");
	172	}
	173
	174	return $out if (length ($out) > 0);
	175	return undef;
	176	}
	177
	178	if ($self->{'encoding'} eq "utf8") {
	179	# utf-8 line
	180	return <$handle>;
	181	}
	182
	183	if ($self->{'encoding'} eq "unicode") {
	184	# unicode line
	185	my $c = "";
	186	my ($c1, $c2) = ("", "");
	187	my $out = "";
	188	while (read ($handle, $c, 2) == 2) {
	189	$c1 = ord (substr ($c, 0, 1));
	190	$c2 = ord (substr ($c, 1, 1));
	191	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	192	$out .= $c;
	193	last if ($c eq "\n");
	194	}
	195
	196	return $out if (length ($out) > 0);
	197	return undef;
	198	}
	199
[1227]	200	if ($self->{'encoding'} eq "iso_8859_1") {
[1870]	201	# we'll use ascii2utf8() for this as it's faster than going
	202	# through convert2unicode()
[627]	203	my $line = "";
	204	if (defined ($line = <$handle>)) {
[1870]	205	return &unicode::ascii2utf8 (\$line);
[627]	206	}
	207	}
[1844]	208
[1870]	209	# everything else uses unicode::convert2unicode
	210	my $line = "";
	211	if (defined ($line = <$handle>)) {
	212	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
[1227]	213	}
[1844]	214
[627]	215	return undef;
	216	}
	217
	218
# read_file converts everything that is left in the file to utf-8 and
# appends the result to the string referenced by $outputref.
#
# This may be a slightly faster way to get the contents of a file than by
# repeatedly calling read_line(). Nothing is appended if no handle has
# been set. There is no meaningful return value.
#
# The caller's input record separator ($/) is left untouched.
sub read_file {
    my $self = shift (@_);
    my ($outputref) = @_;

    # make sure we have a file handle
    return if ($self->{'handle'} eq "");

    my $handle = $self->{'handle'};

    if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
        # special case for the first line of utf8 text to detect whether
        # the file is in utf8 or unicode

        # possible to have no text here...
        my $read_text = $self->read_line ();
        $$outputref .= $read_text if (defined ($read_text));
    }

    if ($self->{'encoding'} eq "unicode") {
        # unicode has to be converted unit by unit, so go through
        # read_line()
        while (defined (my $line = $self->read_line ())) {
            $$outputref .= $line;
        }
        return;
    }

    # every other encoding slurps the rest of the file in one go; local
    # restores whatever $/ the caller had, even if the read dies
    my $text = do { local $/; <$handle> };
    return if (!defined ($text));

    my $encoding = $self->{'encoding'};
    if ($encoding eq "utf8") {
        # already utf-8, append as is
        $$outputref .= $text;

    } elsif ($encoding eq "iso_8859_1") {
        # we'll use ascii2utf8() for this as it's faster than going
        # through convert2unicode()
        $$outputref .= &unicode::ascii2utf8 (\$text);

    } else {
        # everything else uses unicode::convert2unicode
        $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$text));
    }

    return;
}
	271
[627]	272	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: