Context Navigation

source: gsdl/trunk/perllib/multiread.pm@ 16375

Last change on this file since 16375 was 16375, checked in by kjdon, 16 years ago
need no strict refs for isisplugin
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Rev	Line
[627]	1	###########################################################################
	2	#
	3	# multiread.pm --
	4	#
	5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
[9411]	6	# Copyright (C) 2005 New Zealand Digital Library project
[627]	7	#
	8	# This program is free software; you can redistribute it and/or modify
	9	# it under the terms of the GNU General Public License as published by
	10	# the Free Software Foundation; either version 2 of the License, or
	11	# (at your option) any later version.
	12	#
	13	# This program is distributed in the hope that it will be useful,
	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	# GNU General Public License for more details.
	17	#
	18	# You should have received a copy of the GNU General Public License
	19	# along with this program; if not, write to the Free Software
	20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	21	#
	22	###########################################################################
	23
	24	# the multiread object will read in a number of encodings,
	25	# the results are always returned in the utf-8 format
	26
	27	# encodings currently supported are
	28	#
[1844]	29	# utf8 - either utf8 or unicode (automatically detected)
[9411]	30	# unicode - 2-byte UCS (does endian detection)
[1870]	31	#
	32	# plus all encodings in the "encodings" package
[627]	33
	34	package multiread;
	35
[3834]	36	eval {require bytes};
[3767]	37
[15894]	38	use strict;
[16375]	39	no strict 'refs'; # allow filehandles to be variables and viceversa
	40
[627]	41	use unicode;
	42
	43	sub new {
	44	my ($class) = @_;
	45
	46	my $self = {'handle' => "",
	47	'first' => 1,
	48	'encoding' => "utf8",
	49	'bigendian' => 1};
	50
	51	return bless $self, $class;
	52	}
	53
	54	# set_handle expects the file to be already open but
	55	# not read yet
	56	sub set_handle {
[9410]	57	my $self = shift;
	58	$self->{'handle'} = shift;
	59	binmode( $self->{'handle'} );
[627]	60	$self->{'first'} = 1;
	61	$self->{'encoding'} = "utf8";
	62	$self->{'bigendian'} = 1;
	63	}
	64
	65	# set_encoding should be called after set_handle
	66	sub set_encoding {
[9410]	67	my $self = shift;
[9414]	68	$self->{'encoding'} = shift;
[627]	69	}
	70
	71	sub get_encoding {
	72	my $self = shift (@_);
	73	return $self->{'encoding'};
	74	}
	75
	76	# undef will be returned if the eof has been reached
	77	# the result will always be returned in utf-8
[9410]	78
[1868]	79	sub read_unicode_char {
[627]	80	my $self = shift (@_);
	81
	82	# make sure we have a file handle
	83	return undef if ($self->{'handle'} eq "");
	84	my $handle = $self->{'handle'};
	85
	86	if ($self->{'encoding'} eq "utf8") {
	87	# utf-8 text, how many characters we get depends
	88	# on what we find
	89	my $c1 = "";
	90	my $c2 = "";
	91	my $c3 = "";
	92
	93	while (!eof ($handle)) {
	94	$c1 = ord (getc ($handle));
	95
	96	if ($c1 <= 0x7f) {
	97	# one byte character
	98	return chr ($c1);
	99
	100	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
	101	# two byte character
	102	$c2 = getc ($handle) if (!eof ($handle));
	103	return chr ($c1) . $c2;
	104
	105	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
	106	# three byte character
	107	$c2 = getc ($handle) if (!eof ($handle));
	108	$c3 = getc ($handle) if (!eof ($handle));
	109	return chr ($c1) . $c2 . $c3;
	110	}
	111
	112	# if we get here there was an error in the file, we should
	113	# be able to recover from it however, maybe the file is in
	114	# another encoding
	115	}
	116
	117	return undef if (eof ($handle));
	118	}
	119
	120	if ($self->{'encoding'} eq "unicode") {
	121	# unicode text, get the next two characters
	122	return undef if (eof ($handle));
	123	my $c1 = ord (getc ($handle));
	124	return undef if (eof ($handle));
	125	my $c2 = ord (getc ($handle));
	126
	127	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	128	}
	129
	130	return undef;
	131	}
	132
	133
	134	# undef will be returned if the eof has been reached
	135	# the result will always be returned in utf-8
	136	sub read_line {
	137	my $self = shift (@_);
	138
	139	# make sure we have a file handle
	140	return undef if ($self->{'handle'} eq "");
	141
	142	my $handle = $self->{'handle'};
	143
	144	if ($self->{'encoding'} eq "utf8") {
	145	# utf-8 line
	146	return <$handle>;
	147	}
	148
	149	if ($self->{'encoding'} eq "unicode") {
	150	# unicode line
	151	my $c = "";
	152	my ($c1, $c2) = ("", "");
	153	my $out = "";
	154	while (read ($handle, $c, 2) == 2) {
	155	$c1 = ord (substr ($c, 0, 1));
	156	$c2 = ord (substr ($c, 1, 1));
	157	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	158	$out .= $c;
	159	last if ($c eq "\n");
	160	}
	161
	162	return $out if (length ($out) > 0);
	163	return undef;
	164	}
	165
[1227]	166	if ($self->{'encoding'} eq "iso_8859_1") {
[1870]	167	# we'll use ascii2utf8() for this as it's faster than going
	168	# through convert2unicode()
[627]	169	my $line = "";
	170	if (defined ($line = <$handle>)) {
[1870]	171	return &unicode::ascii2utf8 (\$line);
[627]	172	}
	173	}
[1844]	174
[1870]	175	# everything else uses unicode::convert2unicode
	176	my $line = "";
	177	if (defined ($line = <$handle>)) {
	178	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
[1227]	179	}
[1844]	180
[627]	181	return undef;
	182	}
	183
	184
[9410]	185
	186	# this will look for a Byte Order Marker at the start of the file, and
	187	# set the encoding appropriately if there is one, returning any
	188	# non-marker text on the first line (or returns undef).
	189	sub find_unicode_bom {
	190	my $self=shift;
	191
	192	my $non_bom_text=""; # to return if we read in 'real' text
	193
	194	if ($self->{'first'} == 0) { return }
	195
	196	# make sure we have a file handle
	197	return if ($self->{'handle'} eq "");
	198	my $handle = $self->{'handle'};
	199
	200	$self->{'first'} = 0;
	201
	202	my $b1 = ord(getc ($handle));
	203	my $b2;
	204	my $b3;
	205
	206	if ($b1 == 0xfe \|\| $b1 == 0xff) {
	207	$b2 = ord (getc ($handle)) if (!eof ($handle));
	208	if ($b1 == 0xff && $b2 == 0xfe) {
	209	$self->{'encoding'} = "unicode";
	210	$self->{'bigendian'} = 0;
	211	return;
	212	} elsif ($b1 == 0xfe && $b2 == 0xff) {
	213	$self->{'encoding'} = "unicode";
	214	$self->{'bigendian'} = 1;
	215	return;
	216	} elsif ($b1 == 0xef && $b2 == 0xbb) {
	217	$b3 = ord(getc($handle));
	218	if ($b3 == 0xbf) {
	219	$self->{'encoding'} = "utf8";
	220	$self->{'bigendian'} = 1;
	221	return;
	222	}
	223	}
	224	} else { # $b1 != fe or ff
	225	$handle->ungetc($b1); return;
	226	}
	227	# if here, we have removed some chars and they aren't a BOM
	228	if ($self->{'encoding'} eq "unicode") { # return the 2byte char
	229	if (defined ($b3)) { # we looked at this... return it
	230	$handle->ungetc($b3);
	231	}
	232	return &unicode::unicode2utf8([$self->{'bigendian'}?
	233	($b1256+$b2) : ($b2256+$b1)]);
	234	}
	235	# if here, it's utf-8
	236	if ($b2 < 0x80) {
	237	if (defined ($b3)) { # we grabbed this, but don't need it now
	238	$handle->ungetc($b3);
	239	}
	240	return ($b1 . $b2);
	241	}
	242	# if here, we have taken part of a multi-byte char. we need to make
	243	# sure we return the entire character
	244	if (defined($b3) && $b3 < 0x80) { # we have all we need
	245	$handle->ungetc($b3);
	246	return ($b1 . $b2);
	247	}
	248	my $c=$b1.$b2.$b3;
	249	my $b4=$handle->getc();
	250	while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
	251	$c .= $b4;
	252	$b4=$handle->getc();
	253	if (eof($handle)) { last }
	254	}
	255	if (! eof($handle)) {
	256	$handle->ungetc($b4); # this byte is an ascii byte
	257	}
	258	return $c;
	259	}
	260
	261
	262
# read_file converts the entire (remaining) contents of the file to utf8
# and appends the result to $$outputref. Nothing is done when no handle
# has been set.
#
# this may be a slightly faster way to get the contents of a file than by
# repeatedly calling read_line()
#
# The input record separator $/ is only changed locally while slurping,
# so the value the caller had set is still in place afterwards.
sub read_file {
    my $self = shift (@_);
    my ($outputref) = @_;

    # make sure we have a file handle
    return if ($self->{'handle'} eq "");

    my $handle = $self->{'handle'};

    # if encoding is set to utf8 or unicode, sniff to see if there is a
    # byte order marker
    if ($self->{'first'} &&
        ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
        # this will change $self's encoding if there is a BOM
        my $read_text = $self->find_unicode_bom();
        $$outputref .= $read_text if (defined($read_text));
    }

    if ($self->{'encoding'} eq "utf8") {
        # already utf-8, slurp the rest of the file as it is
        my $text = do { local $/; <$handle> };
        $$outputref .= $text if (defined ($text));
        return;
    }

    if ($self->{'encoding'} eq "unicode") {
        # 2-byte unicode has to be converted, read_line does that for us
        my $line = "";
        while (defined ($line = $self->read_line())) {
            $$outputref .= $line;
        }
        return;
    }

    if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
        # we'll use ascii2utf8() for this as it's faster than going
        # through convert2unicode()
        my $text = do { local $/; <$handle> };
        $$outputref .= &unicode::ascii2utf8 (\$text);
        return;
    }

    # everything else uses unicode::convert2unicode
    my $text = do { local $/; <$handle> };
    $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
}
	315
[627]	316	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: