Context Navigation

source: trunk/gsdl/perllib/multiread.pm@ 1991

Last change on this file since 1991 was 1870, checked in by sjboddie, 23 years ago
Tidied up language support stuff.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Rev	Line
[627]	1	###########################################################################
	2	#
	3	# multiread.pm --
	4	#
	5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
	6	#
	7	# This program is free software; you can redistribute it and/or modify
	8	# it under the terms of the GNU General Public License as published by
	9	# the Free Software Foundation; either version 2 of the License, or
	10	# (at your option) any later version.
	11	#
	12	# This program is distributed in the hope that it will be useful,
	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	# GNU General Public License for more details.
	16	#
	17	# You should have received a copy of the GNU General Public License
	18	# along with this program; if not, write to the Free Software
	19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	20	#
	21	###########################################################################
	22
	23	# the multiread object will read in a number of encodings,
	24	# the results are always returned in the utf-8 format
	25
	26	# encodings currently supported are
	27	#
[1844]	28	# utf8 - either utf8 or unicode (automatically detected)
	29	# unicode - just unicode (doesn't currently do endian detection)
[1870]	30	#
	31	# plus all encodings in the "encodings" package
[627]	32
	33	package multiread;
	34
	35	use unicode;
	36
	37	sub new {
	38	my ($class) = @_;
	39
	40	my $self = {'handle' => "",
	41	'first' => 1,
	42	'encoding' => "utf8",
	43	'bigendian' => 1};
	44
	45	return bless $self, $class;
	46	}
	47
	48	# set_handle expects the file to be already open but
	49	# not read yet
	50	sub set_handle {
	51	my $self = shift (@_);
	52	($self->{'handle'}) = @_;
	53	$self->{'first'} = 1;
	54	$self->{'encoding'} = "utf8";
	55	$self->{'bigendian'} = 1;
	56	}
	57
	58	# set_encoding should be called after set_handle
	59	sub set_encoding {
	60	my $self = shift (@_);
	61	($self->{'encoding'}) = @_;
	62	}
	63
	64	sub get_encoding {
	65	my $self = shift (@_);
	66	return $self->{'encoding'};
	67	}
	68
	69	# undef will be returned if the eof has been reached
	70	# the result will always be returned in utf-8
	71	# if automatic detection between utf8 and unicode is desired
	72	# then the encoding should be initially set to utf8
[1868]	73	sub read_unicode_char {
[627]	74	my $self = shift (@_);
	75
	76	# make sure we have a file handle
	77	return undef if ($self->{'handle'} eq "");
	78	my $handle = $self->{'handle'};
[1868]	79	binmode ($handle);
[627]	80
	81	if ($self->{'encoding'} eq "utf8") {
	82	# utf-8 text, how many characters we get depends
	83	# on what we find
	84	my $c1 = "";
	85	my $c2 = "";
	86	my $c3 = "";
	87
	88	while (!eof ($handle)) {
	89	$c1 = ord (getc ($handle));
	90	if ($self->{'first'}) {
	91	$self->{'first'} = 0;
	92
	93	if ($c1 == 0xfe \|\| $c1 == 0xff) {
	94	$c2 = ord (getc ($handle)) if (!eof ($handle));
	95
	96	# if unicode fall through to the unicode reading code
	97	if ($c1 == 0xff && $c2 == 0xfe) {
	98	$self->{'encoding'} = "unicode";
	99	$self->{'bigendian'} = 0;
	100	last;
	101
	102	} elsif ($c1 == 0xfe && $c2 == 0xff) {
	103	$self->{'encoding'} = "unicode";
	104	$self->{'bigendian'} = 1;
	105	last;
	106	}
	107
	108	# an error, but we might be able to recover
	109	# from it
	110	$c1 = $c2;
	111	}
	112	}
	113
	114	if ($c1 <= 0x7f) {
	115	# one byte character
	116	return chr ($c1);
	117
	118	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
	119	# two byte character
	120	$c2 = getc ($handle) if (!eof ($handle));
	121	return chr ($c1) . $c2;
	122
	123	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
	124	# three byte character
	125	$c2 = getc ($handle) if (!eof ($handle));
	126	$c3 = getc ($handle) if (!eof ($handle));
	127	return chr ($c1) . $c2 . $c3;
	128	}
	129
	130	# if we get here there was an error in the file, we should
	131	# be able to recover from it however, maybe the file is in
	132	# another encoding
	133	}
	134
	135	return undef if (eof ($handle));
	136	}
	137
	138	if ($self->{'encoding'} eq "unicode") {
	139	# unicode text, get the next two characters
	140	return undef if (eof ($handle));
	141	my $c1 = ord (getc ($handle));
	142	return undef if (eof ($handle));
	143	my $c2 = ord (getc ($handle));
	144
	145	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	146	}
	147
	148	return undef;
	149	}
	150
	151
	152	# undef will be returned if the eof has been reached
	153	# the result will always be returned in utf-8
	154	sub read_line {
	155	my $self = shift (@_);
	156
	157	# make sure we have a file handle
	158	return undef if ($self->{'handle'} eq "");
	159
	160	my $handle = $self->{'handle'};
	161
	162	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
	163	# special case for the first line of utf8 text to detect whether
	164	# the file is in utf8 or unicode
	165	my $out = "";
	166	my $thisc = "";
[1868]	167	while (defined ($thisc = $self->read_unicode_char())) {
[627]	168	$out .= $thisc;
	169	last if ($thisc eq "\n");
	170	}
	171
	172	return $out if (length ($out) > 0);
	173	return undef;
	174	}
	175
	176	if ($self->{'encoding'} eq "utf8") {
	177	# utf-8 line
	178	return <$handle>;
	179	}
	180
	181	if ($self->{'encoding'} eq "unicode") {
	182	# unicode line
	183	my $c = "";
	184	my ($c1, $c2) = ("", "");
	185	my $out = "";
	186	while (read ($handle, $c, 2) == 2) {
	187	$c1 = ord (substr ($c, 0, 1));
	188	$c2 = ord (substr ($c, 1, 1));
	189	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	190	$out .= $c;
	191	last if ($c eq "\n");
	192	}
	193
	194	return $out if (length ($out) > 0);
	195	return undef;
	196	}
	197
[1227]	198	if ($self->{'encoding'} eq "iso_8859_1") {
[1870]	199	# we'll use ascii2utf8() for this as it's faster than going
	200	# through convert2unicode()
[627]	201	my $line = "";
	202	if (defined ($line = <$handle>)) {
[1870]	203	return &unicode::ascii2utf8 (\$line);
[627]	204	}
	205	}
[1844]	206
[1870]	207	# everything else uses unicode::convert2unicode
	208	my $line = "";
	209	if (defined ($line = <$handle>)) {
	210	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
[1227]	211	}
[1844]	212
[627]	213	return undef;
	214	}
	215
	216
[1224]	217	# will convert entire contents of file to utf8 and append result to $outputref
[1227]	218	# this may be a slightly faster way to get the contents of a file than by
	219	# recursively calling read_line()
[1224]	220	sub read_file {
	221	my $self = shift (@_);
	222	my ($outputref) = @_;
	223
	224	# make sure we have a file handle
	225	return if ($self->{'handle'} eq "");
	226
	227	my $handle = $self->{'handle'};
	228
	229	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
	230	# special case for the first line of utf8 text to detect whether
	231	# the file is in utf8 or unicode
	232	$$text .= $self->read_line ();
	233	}
	234
	235	if ($self->{'encoding'} eq "utf8") {
	236	undef $/;
	237	$$outputref .= <$handle>;
	238	$/ = "\n";
	239	return;
	240	}
	241
	242	if ($self->{'encoding'} eq "unicode") {
	243	my $line = "";
	244	while (defined ($line = $self->read_line())) {
	245	$$outputref .= $line;
	246	}
	247	return;
	248	}
	249
[1227]	250	if ($self->{'encoding'} eq "iso_8859_1") {
[1870]	251	# we'll use ascii2utf8() for this as it's faster than going
	252	# through convert2unicode()
[1224]	253	undef $/;
[1227]	254	my $text = <$handle>;
[1224]	255	$/ = "\n";
[1870]	256	$$outputref .= &unicode::ascii2utf8 (\$text);
[1224]	257	return;
	258	}
[1868]	259
[1870]	260	# everything else uses unicode::convert2unicode
[1868]	261	undef $/;
	262	my $text = <$handle>;
	263	$/ = "\n";
[1870]	264	$$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
[1224]	265	}
	266
[627]	267	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: