Context Navigation

source: gsdl/trunk/perllib/multiread.pm@ 15894

Last change on this file since 15894 was 15894, checked in by mdewsnip, 16 years ago
Added "use strict" to the files missing it.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	# Copyright (C) 2005 New Zealand Digital Library project
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either version 2 of the License, or
11	# (at your option) any later version.
12	#
13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21	#
22	###########################################################################
23
24	# the multiread object will read in a number of encodings,
25	# the results are always returned in the utf-8 format
26
27	# encodings currently supported are
28	#
29	# utf8 - either utf8 or unicode (automatically detected)
30	# unicode - 2-byte UCS (does endian detection)
31	#
32	# plus all encodings in the "encodings" package
33
34	package multiread;
35
36	eval {require bytes};
37
38	use strict;
39	use unicode;
40
41	sub new {
42	my ($class) = @_;
43
44	my $self = {'handle' => "",
45	'first' => 1,
46	'encoding' => "utf8",
47	'bigendian' => 1};
48
49	return bless $self, $class;
50	}
51
52	# set_handle expects the file to be already open but
53	# not read yet
54	sub set_handle {
55	my $self = shift;
56	$self->{'handle'} = shift;
57	binmode( $self->{'handle'} );
58	$self->{'first'} = 1;
59	$self->{'encoding'} = "utf8";
60	$self->{'bigendian'} = 1;
61	}
62
63	# set_encoding should be called after set_handle
64	sub set_encoding {
65	my $self = shift;
66	$self->{'encoding'} = shift;
67	}
68
69	sub get_encoding {
70	my $self = shift (@_);
71	return $self->{'encoding'};
72	}
73
74	# undef will be returned if the eof has been reached
75	# the result will always be returned in utf-8
76
77	sub read_unicode_char {
78	my $self = shift (@_);
79
80	# make sure we have a file handle
81	return undef if ($self->{'handle'} eq "");
82	my $handle = $self->{'handle'};
83
84	if ($self->{'encoding'} eq "utf8") {
85	# utf-8 text, how many characters we get depends
86	# on what we find
87	my $c1 = "";
88	my $c2 = "";
89	my $c3 = "";
90
91	while (!eof ($handle)) {
92	$c1 = ord (getc ($handle));
93
94	if ($c1 <= 0x7f) {
95	# one byte character
96	return chr ($c1);
97
98	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
99	# two byte character
100	$c2 = getc ($handle) if (!eof ($handle));
101	return chr ($c1) . $c2;
102
103	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
104	# three byte character
105	$c2 = getc ($handle) if (!eof ($handle));
106	$c3 = getc ($handle) if (!eof ($handle));
107	return chr ($c1) . $c2 . $c3;
108	}
109
110	# if we get here there was an error in the file, we should
111	# be able to recover from it however, maybe the file is in
112	# another encoding
113	}
114
115	return undef if (eof ($handle));
116	}
117
118	if ($self->{'encoding'} eq "unicode") {
119	# unicode text, get the next two characters
120	return undef if (eof ($handle));
121	my $c1 = ord (getc ($handle));
122	return undef if (eof ($handle));
123	my $c2 = ord (getc ($handle));
124
125	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
126	}
127
128	return undef;
129	}
130
131
132	# undef will be returned if the eof has been reached
133	# the result will always be returned in utf-8
134	sub read_line {
135	my $self = shift (@_);
136
137	# make sure we have a file handle
138	return undef if ($self->{'handle'} eq "");
139
140	my $handle = $self->{'handle'};
141
142	if ($self->{'encoding'} eq "utf8") {
143	# utf-8 line
144	return <$handle>;
145	}
146
147	if ($self->{'encoding'} eq "unicode") {
148	# unicode line
149	my $c = "";
150	my ($c1, $c2) = ("", "");
151	my $out = "";
152	while (read ($handle, $c, 2) == 2) {
153	$c1 = ord (substr ($c, 0, 1));
154	$c2 = ord (substr ($c, 1, 1));
155	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
156	$out .= $c;
157	last if ($c eq "\n");
158	}
159
160	return $out if (length ($out) > 0);
161	return undef;
162	}
163
164	if ($self->{'encoding'} eq "iso_8859_1") {
165	# we'll use ascii2utf8() for this as it's faster than going
166	# through convert2unicode()
167	my $line = "";
168	if (defined ($line = <$handle>)) {
169	return &unicode::ascii2utf8 (\$line);
170	}
171	}
172
173	# everything else uses unicode::convert2unicode
174	my $line = "";
175	if (defined ($line = <$handle>)) {
176	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
177	}
178
179	return undef;
180	}
181
182
183
184	# this will look for a Byte Order Marker at the start of the file, and
185	# set the encoding appropriately if there is one, returning any
186	# non-marker text on the first line (or returns undef).
187	sub find_unicode_bom {
188	my $self=shift;
189
190	my $non_bom_text=""; # to return if we read in 'real' text
191
192	if ($self->{'first'} == 0) { return }
193
194	# make sure we have a file handle
195	return if ($self->{'handle'} eq "");
196	my $handle = $self->{'handle'};
197
198	$self->{'first'} = 0;
199
200	my $b1 = ord(getc ($handle));
201	my $b2;
202	my $b3;
203
204	if ($b1 == 0xfe \|\| $b1 == 0xff) {
205	$b2 = ord (getc ($handle)) if (!eof ($handle));
206	if ($b1 == 0xff && $b2 == 0xfe) {
207	$self->{'encoding'} = "unicode";
208	$self->{'bigendian'} = 0;
209	return;
210	} elsif ($b1 == 0xfe && $b2 == 0xff) {
211	$self->{'encoding'} = "unicode";
212	$self->{'bigendian'} = 1;
213	return;
214	} elsif ($b1 == 0xef && $b2 == 0xbb) {
215	$b3 = ord(getc($handle));
216	if ($b3 == 0xbf) {
217	$self->{'encoding'} = "utf8";
218	$self->{'bigendian'} = 1;
219	return;
220	}
221	}
222	} else { # $b1 != fe or ff
223	$handle->ungetc($b1); return;
224	}
225	# if here, we have removed some chars and they aren't a BOM
226	if ($self->{'encoding'} eq "unicode") { # return the 2byte char
227	if (defined ($b3)) { # we looked at this... return it
228	$handle->ungetc($b3);
229	}
230	return &unicode::unicode2utf8([$self->{'bigendian'}?
231	($b1256+$b2) : ($b2256+$b1)]);
232	}
233	# if here, it's utf-8
234	if ($b2 < 0x80) {
235	if (defined ($b3)) { # we grabbed this, but don't need it now
236	$handle->ungetc($b3);
237	}
238	return ($b1 . $b2);
239	}
240	# if here, we have taken part of a multi-byte char. we need to make
241	# sure we return the entire character
242	if (defined($b3) && $b3 < 0x80) { # we have all we need
243	$handle->ungetc($b3);
244	return ($b1 . $b2);
245	}
246	my $c=$b1.$b2.$b3;
247	my $b4=$handle->getc();
248	while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
249	$c .= $b4;
250	$b4=$handle->getc();
251	if (eof($handle)) { last }
252	}
253	if (! eof($handle)) {
254	$handle->ungetc($b4); # this byte is an ascii byte
255	}
256	return $c;
257	}
258
259
260
261	# will convert entire contents of file to utf8 and append result to $outputref
262	# this may be a slightly faster way to get the contents of a file than by
263	# recursively calling read_line()
264	sub read_file {
265	my $self = shift (@_);
266	my ($outputref) = @_;
267
268	# make sure we have a file handle
269	return if ($self->{'handle'} eq "");
270
271	my $handle = $self->{'handle'};
272
273	# if encoding is set to utf8 or unicode, sniff to see if there is a
274	# byte order marker
275	if ($self->{'first'} &&
276	($self->{'encoding'} eq "utf8" \|\| $self->{'encoding'} eq 'unicode')) {
277	# this will change $self's encoding if there is a BOM
278	my $read_text = $self->find_unicode_bom();
279	$$outputref .= $read_text if (defined($read_text));
280	}
281
282	if ($self->{'encoding'} eq "utf8") {
283	undef $/;
284	$$outputref .= <$handle>;
285	$/ = "\n";
286	return;
287	}
288
289	if ($self->{'encoding'} eq "unicode") {
290	my $line = "";
291	while (defined ($line = $self->read_line())) {
292	$$outputref .= $line;
293	}
294	return;
295	}
296
297	if ($self->{'encoding'} eq "iso_8859_1" \|\| $self->{'encoding'} eq "ascii") {
298	# we'll use ascii2utf8() for this as it's faster than going
299	# through convert2unicode()
300	undef $/;
301	my $text = <$handle>;
302	$/ = "\n";
303	$$outputref .= &unicode::ascii2utf8 (\$text);
304	return;
305	}
306
307	# everything else uses unicode::convert2unicode
308	undef $/;
309	my $text = <$handle>;
310	$/ = "\n";
311	$$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
312	}
313
314	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: