Context Navigation

source: gsdl/trunk/perllib/multiread.pm@ 20647

Last change on this file since 20647 was 20579, checked in by davidb, 15 years ago
Unicode encoding broken down into two steps.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Rev	Line
[627]	1	###########################################################################
	2	#
	3	# multiread.pm --
	4	#
	5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
[9411]	6	# Copyright (C) 2005 New Zealand Digital Library project
[627]	7	#
	8	# This program is free software; you can redistribute it and/or modify
	9	# it under the terms of the GNU General Public License as published by
	10	# the Free Software Foundation; either version 2 of the License, or
	11	# (at your option) any later version.
	12	#
	13	# This program is distributed in the hope that it will be useful,
	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	# GNU General Public License for more details.
	17	#
	18	# You should have received a copy of the GNU General Public License
	19	# along with this program; if not, write to the Free Software
	20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	21	#
	22	###########################################################################
	23
	24	# the multiread object will read in a number of encodings,
	25	# the results are always returned in the utf-8 format
	26
	27	# encodings currently supported are
	28	#
[1844]	29	# utf8 - either utf8 or unicode (automatically detected)
[9411]	30	# unicode - 2-byte UCS (does endian detection)
[1870]	31	#
	32	# plus all encodings in the "encodings" package
[627]	33
	34	package multiread;
	35
[3834]	36	eval {require bytes};
[3767]	37
[15894]	38	use strict;
[16375]	39	no strict 'refs'; # allow filehandles to be variables and viceversa
	40
[627]	41	use unicode;
	42
	43	sub new {
	44	my ($class) = @_;
	45
	46	my $self = {'handle' => "",
	47	'first' => 1,
	48	'encoding' => "utf8",
	49	'bigendian' => 1};
	50
	51	return bless $self, $class;
	52	}
	53
	54	# set_handle expects the file to be already open but
	55	# not read yet
	56	sub set_handle {
[9410]	57	my $self = shift;
	58	$self->{'handle'} = shift;
	59	binmode( $self->{'handle'} );
[627]	60	$self->{'first'} = 1;
	61	$self->{'encoding'} = "utf8";
	62	$self->{'bigendian'} = 1;
	63	}
	64
	65	# set_encoding should be called after set_handle
	66	sub set_encoding {
[9410]	67	my $self = shift;
[9414]	68	$self->{'encoding'} = shift;
[627]	69	}
	70
	71	sub get_encoding {
	72	my $self = shift (@_);
	73	return $self->{'encoding'};
	74	}
	75
	76	# undef will be returned if the eof has been reached
	77	# the result will always be returned in utf-8
[9410]	78
[1868]	79	sub read_unicode_char {
[627]	80	my $self = shift (@_);
	81
	82	# make sure we have a file handle
	83	return undef if ($self->{'handle'} eq "");
	84	my $handle = $self->{'handle'};
	85
	86	if ($self->{'encoding'} eq "utf8") {
	87	# utf-8 text, how many characters we get depends
	88	# on what we find
	89	my $c1 = "";
	90	my $c2 = "";
	91	my $c3 = "";
	92
	93	while (!eof ($handle)) {
	94	$c1 = ord (getc ($handle));
	95
	96	if ($c1 <= 0x7f) {
	97	# one byte character
	98	return chr ($c1);
	99
	100	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
	101	# two byte character
	102	$c2 = getc ($handle) if (!eof ($handle));
	103	return chr ($c1) . $c2;
	104
	105	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
	106	# three byte character
	107	$c2 = getc ($handle) if (!eof ($handle));
	108	$c3 = getc ($handle) if (!eof ($handle));
	109	return chr ($c1) . $c2 . $c3;
	110	}
	111
	112	# if we get here there was an error in the file, we should
	113	# be able to recover from it however, maybe the file is in
	114	# another encoding
	115	}
	116
	117	return undef if (eof ($handle));
	118	}
	119
	120	if ($self->{'encoding'} eq "unicode") {
	121	# unicode text, get the next two characters
	122	return undef if (eof ($handle));
	123	my $c1 = ord (getc ($handle));
	124	return undef if (eof ($handle));
	125	my $c2 = ord (getc ($handle));
	126
	127	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
	128	}
	129
	130	return undef;
	131	}
	132
	133
# unicodechar_to_ord splits a string of raw 2-byte characters into
# 16-bit values and returns them as an array reference.
#
# $unicode_text is expected to be a string of bytes. Each consecutive
# pair of bytes becomes one value: high byte first when 'bigendian' is
# true, low byte first otherwise. A trailing odd byte is ignored.
sub unicodechar_to_ord
{
    my $self = shift (@_);
    my ($unicode_text) = @_;

    # 'n' = 16-bit big-endian, 'v' = 16-bit little-endian. unpack treats
    # every byte value alike, including 0x0a (newline), which a
    # pattern such as m/(..)/g would refuse to match.
    my $template = ($self->{'bigendian'}) ? 'n*' : 'v*';
    my @ord_array = unpack ($template, $unicode_text);

    return \@ord_array;
}
	154
	155
# read_line reads the next line from the handle and returns it in utf-8.
#
# Returns undef when no handle has been set or when the end of the file
# has been reached.
#
# encoding "utf8":       the line is returned unchanged.
#                        (NOTE: <$handle> is evaluated in the caller's
#                        context, so a list-context call returns all
#                        remaining lines; call in scalar context.)
# encoding "unicode":    2-byte characters are read and converted one at
#                        a time until a newline or the end of the file.
# encoding "iso_8859_1": converted with unicode::ascii2utf8.
# anything else:         converted with unicode::convert2unicode followed
#                        by unicode::unicode2utf8.
sub read_line {
    my $self = shift (@_);

    # make sure we have a file handle
    return undef if ($self->{'handle'} eq "");

    my $handle = $self->{'handle'};
    my $encoding = $self->{'encoding'};

    if ($encoding eq "utf8") {
        # utf-8 line, nothing to convert
        return <$handle>;
    }

    if ($encoding eq "unicode") {
        # unicode line, built up one 2-byte character at a time
        my $pair = "";
        my $out = "";
        while (read ($handle, $pair, 2) == 2) {
            my $c1 = ord (substr ($pair, 0, 1));
            my $c2 = ord (substr ($pair, 1, 1));
            my $code = ($self->{'bigendian'}) ? ($c1 * 256 + $c2) : ($c2 * 256 + $c1);

            my $c = &unicode::unicode2utf8 ([$code]);
            $out .= $c;
            last if ($c eq "\n");
        }

        return $out if (length ($out) > 0);
        return undef;
    }

    # the remaining encodings are line-oriented on the raw bytes
    my $line = <$handle>;
    return undef if (!defined ($line));

    if ($encoding eq "iso_8859_1") {
        # we'll use ascii2utf8() for this as it's faster than going
        # through convert2unicode()
        return &unicode::ascii2utf8 (\$line);
    }

    # everything else uses unicode::convert2unicode
    return &unicode::unicode2utf8 (&unicode::convert2unicode ($encoding, \$line));
}
	205
	206
[9410]	207
	208	# this will look for a Byte Order Marker at the start of the file, and
	209	# set the encoding appropriately if there is one, returning any
	210	# non-marker text on the first line (or returns undef).
	211	sub find_unicode_bom {
	212	my $self=shift;
	213
	214	my $non_bom_text=""; # to return if we read in 'real' text
	215
	216	if ($self->{'first'} == 0) { return }
	217
	218	# make sure we have a file handle
	219	return if ($self->{'handle'} eq "");
	220	my $handle = $self->{'handle'};
	221
	222	$self->{'first'} = 0;
	223
[16793]	224	my $gc = getc ($handle);
	225	my $b1 = ord($gc);
[9410]	226	my $b2;
	227	my $b3;
	228
	229	if ($b1 == 0xfe \|\| $b1 == 0xff) {
	230	$b2 = ord (getc ($handle)) if (!eof ($handle));
	231	if ($b1 == 0xff && $b2 == 0xfe) {
	232	$self->{'encoding'} = "unicode";
	233	$self->{'bigendian'} = 0;
	234	return;
	235	} elsif ($b1 == 0xfe && $b2 == 0xff) {
	236	$self->{'encoding'} = "unicode";
	237	$self->{'bigendian'} = 1;
	238	return;
	239	} elsif ($b1 == 0xef && $b2 == 0xbb) {
	240	$b3 = ord(getc($handle));
	241	if ($b3 == 0xbf) {
	242	$self->{'encoding'} = "utf8";
	243	$self->{'bigendian'} = 1;
	244	return;
	245	}
[16719]	246	else {
	247	# put back all three bytes
	248	$handle->ungetc($b3);
	249	$handle->ungetc($b2);
	250	$handle->ungetc($b1); return;
	251
	252	}
[9410]	253	}
[16719]	254	else {
	255	# put back two bytes read
	256	$handle->ungetc($b2);
	257	$handle->ungetc($b1); return;
	258	}
[9410]	259	} else { # $b1 != fe or ff
[16719]	260	# put back the one byte read
[9410]	261	$handle->ungetc($b1); return;
	262	}
	263	}
	264
	265
# read_file_no_decoding appends the rest of the file, as raw undecoded
# bytes, to the string referenced by $outputref.
#
# If nothing has been read from the handle yet and the encoding is
# "utf8" or "unicode", the start of the file is first checked for a
# byte order marker (see find_unicode_bom), which may change the
# encoding and byte order recorded in $self.
#
# Does nothing if no handle has been set.
sub read_file_no_decoding
{
    my $self = shift (@_);
    my ($outputref) = @_;

    # make sure we have a file handle
    return if ($self->{'handle'} eq "");

    my $handle = $self->{'handle'};

    # if encoding is set to utf8 or unicode, sniff to see if there is a
    # byte order marker
    if ($self->{'first'} &&
        ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {

        # this will change $self's encoding if there is a BOM; a
        # recognised BOM is consumed, anything else is pushed back
        $self->find_unicode_bom();
    }

    # slurp mode for this read only; 'local' restores whatever record
    # separator the caller had in effect when the sub returns
    local $/ = undef;
    my $contents = <$handle>;
    $$outputref .= $contents if (defined ($contents));

    return;
}
	290
	291
	292	# will convert entire contents of file to utf8 and append result to $outputref
	293	# this may be a slightly faster way to get the contents of a file than by
	294	# recursively calling read_line()
	295	sub decode_text {
	296	my $self = shift (@_);
	297
	298	my ($raw_text,$decoded_text_ref) = @_;
	299
[1224]	300	if ($self->{'encoding'} eq "utf8") {
[16719]	301	# Nothing to do, raw text is in utf 8
	302	$$decoded_text_ref .= $raw_text;
[1224]	303	return;
	304	}
	305
	306	if ($self->{'encoding'} eq "unicode") {
[16719]	307	my $unicode_array = $self->unicodechar_to_ord($raw_text);
	308	$$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
[1224]	309	return;
	310	}
	311
[12832]	312	if ($self->{'encoding'} eq "iso_8859_1" \|\| $self->{'encoding'} eq "ascii") {
[1870]	313	# we'll use ascii2utf8() for this as it's faster than going
	314	# through convert2unicode()
[16719]	315	$$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
[1224]	316	return;
	317	}
[1868]	318
[1870]	319	# everything else uses unicode::convert2unicode
[20579]	320	my $unicode_text = &unicode::convert2unicode ($self->{'encoding'}, \$raw_text);
	321
	322	$$decoded_text_ref .= &unicode::unicode2utf8 ($unicode_text);
	323
	324	### print STDERR "!!! decoded ", join(":",map { ord($_) } split(//,$$decoded_text_ref)), "\n";
[1224]	325	}
	326
[16719]	327
	328
	329	# will convert entire contents of file to utf8 and append result to $outputref
	330	# this may be a slightly faster way to get the contents of a file than by
	331	# recursively calling read_line()
	332	sub read_file {
	333	my $self = shift (@_);
	334	my ($outputref) = @_;
	335
	336	# While unusual, $raw_text is initialized to $$outputref
	337	# to be consistent with code before refactoring
	338	my $raw_text = $$outputref;
	339
	340	$self->read_file_no_decoding(\$raw_text);
	341	$self->decode_text($raw_text,$outputref);
	342	}
	343
[627]	344	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: