Context Navigation

source: gsdl/trunk/perllib/multiread.pm@ 14926

Last change on this file since 14926 was 12832, checked in by kjdon, 18 years ago
added in ascii casee in read_file - if not done specially, will be used in a call to unicode::unicode2utf8, which will fail as it doesn't accept ascii
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	# Copyright (C) 2005 New Zealand Digital Library project
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either version 2 of the License, or
11	# (at your option) any later version.
12	#
13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21	#
22	###########################################################################
23
24	# the multiread object will read in a number of encodings,
25	# the results are always returned in the utf-8 format
26
27	# encodings currently supported are
28	#
29	# utf8 - either utf8 or unicode (automatically detected)
30	# unicode - 2-byte UCS (does endian detection)
31	#
32	# plus all encodings in the "encodings" package
33
34	package multiread;
35
36	eval {require bytes};
37
38	use unicode;
39
40	sub new {
41	my ($class) = @_;
42
43	my $self = {'handle' => "",
44	'first' => 1,
45	'encoding' => "utf8",
46	'bigendian' => 1};
47
48	return bless $self, $class;
49	}
50
51	# set_handle expects the file to be already open but
52	# not read yet
53	sub set_handle {
54	my $self = shift;
55	$self->{'handle'} = shift;
56	binmode( $self->{'handle'} );
57	$self->{'first'} = 1;
58	$self->{'encoding'} = "utf8";
59	$self->{'bigendian'} = 1;
60	}
61
62	# set_encoding should be called after set_handle
63	sub set_encoding {
64	my $self = shift;
65	$self->{'encoding'} = shift;
66	}
67
68	sub get_encoding {
69	my $self = shift (@_);
70	return $self->{'encoding'};
71	}
72
73	# undef will be returned if the eof has been reached
74	# the result will always be returned in utf-8
75
76	sub read_unicode_char {
77	my $self = shift (@_);
78
79	# make sure we have a file handle
80	return undef if ($self->{'handle'} eq "");
81	my $handle = $self->{'handle'};
82
83	if ($self->{'encoding'} eq "utf8") {
84	# utf-8 text, how many characters we get depends
85	# on what we find
86	my $c1 = "";
87	my $c2 = "";
88	my $c3 = "";
89
90	while (!eof ($handle)) {
91	$c1 = ord (getc ($handle));
92
93	if ($c1 <= 0x7f) {
94	# one byte character
95	return chr ($c1);
96
97	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
98	# two byte character
99	$c2 = getc ($handle) if (!eof ($handle));
100	return chr ($c1) . $c2;
101
102	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
103	# three byte character
104	$c2 = getc ($handle) if (!eof ($handle));
105	$c3 = getc ($handle) if (!eof ($handle));
106	return chr ($c1) . $c2 . $c3;
107	}
108
109	# if we get here there was an error in the file, we should
110	# be able to recover from it however, maybe the file is in
111	# another encoding
112	}
113
114	return undef if (eof ($handle));
115	}
116
117	if ($self->{'encoding'} eq "unicode") {
118	# unicode text, get the next two characters
119	return undef if (eof ($handle));
120	my $c1 = ord (getc ($handle));
121	return undef if (eof ($handle));
122	my $c2 = ord (getc ($handle));
123
124	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
125	}
126
127	return undef;
128	}
129
130
131	# undef will be returned if the eof has been reached
132	# the result will always be returned in utf-8
133	sub read_line {
134	my $self = shift (@_);
135
136	# make sure we have a file handle
137	return undef if ($self->{'handle'} eq "");
138
139	my $handle = $self->{'handle'};
140
141	if ($self->{'encoding'} eq "utf8") {
142	# utf-8 line
143	return <$handle>;
144	}
145
146	if ($self->{'encoding'} eq "unicode") {
147	# unicode line
148	my $c = "";
149	my ($c1, $c2) = ("", "");
150	my $out = "";
151	while (read ($handle, $c, 2) == 2) {
152	$c1 = ord (substr ($c, 0, 1));
153	$c2 = ord (substr ($c, 1, 1));
154	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
155	$out .= $c;
156	last if ($c eq "\n");
157	}
158
159	return $out if (length ($out) > 0);
160	return undef;
161	}
162
163	if ($self->{'encoding'} eq "iso_8859_1") {
164	# we'll use ascii2utf8() for this as it's faster than going
165	# through convert2unicode()
166	my $line = "";
167	if (defined ($line = <$handle>)) {
168	return &unicode::ascii2utf8 (\$line);
169	}
170	}
171
172	# everything else uses unicode::convert2unicode
173	my $line = "";
174	if (defined ($line = <$handle>)) {
175	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
176	}
177
178	return undef;
179	}
180
181
182
183	# this will look for a Byte Order Marker at the start of the file, and
184	# set the encoding appropriately if there is one, returning any
185	# non-marker text on the first line (or returns undef).
186	sub find_unicode_bom {
187	my $self=shift;
188
189	my $non_bom_text=""; # to return if we read in 'real' text
190
191	if ($self->{'first'} == 0) { return }
192
193	# make sure we have a file handle
194	return if ($self->{'handle'} eq "");
195	my $handle = $self->{'handle'};
196
197	$self->{'first'} = 0;
198
199	my $b1 = ord(getc ($handle));
200	my $b2;
201	my $b3;
202
203	if ($b1 == 0xfe \|\| $b1 == 0xff) {
204	$b2 = ord (getc ($handle)) if (!eof ($handle));
205	if ($b1 == 0xff && $b2 == 0xfe) {
206	$self->{'encoding'} = "unicode";
207	$self->{'bigendian'} = 0;
208	return;
209	} elsif ($b1 == 0xfe && $b2 == 0xff) {
210	$self->{'encoding'} = "unicode";
211	$self->{'bigendian'} = 1;
212	return;
213	} elsif ($b1 == 0xef && $b2 == 0xbb) {
214	$b3 = ord(getc($handle));
215	if ($b3 == 0xbf) {
216	$self->{'encoding'} = "utf8";
217	$self->{'bigendian'} = 1;
218	return;
219	}
220	}
221	} else { # $b1 != fe or ff
222	$handle->ungetc($b1); return;
223	}
224	# if here, we have removed some chars and they aren't a BOM
225	if ($self->{'encoding'} eq "unicode") { # return the 2byte char
226	if (defined ($b3)) { # we looked at this... return it
227	$handle->ungetc($b3);
228	}
229	return &unicode::unicode2utf8([$self->{'bigendian'}?
230	($b1256+$b2) : ($b2256+$b1)]);
231	}
232	# if here, it's utf-8
233	if ($b2 < 0x80) {
234	if (defined ($b3)) { # we grabbed this, but don't need it now
235	$handle->ungetc($b3);
236	}
237	return ($b1 . $b2);
238	}
239	# if here, we have taken part of a multi-byte char. we need to make
240	# sure we return the entire character
241	if (defined($b3) && $b3 < 0x80) { # we have all we need
242	$handle->ungetc($b3);
243	return ($b1 . $b2);
244	}
245	my $c=$b1.$b2.$b3;
246	my $b4=$handle->getc();
247	while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
248	$c .= $b4;
249	$b4=$handle->getc();
250	if (eof($handle)) { last }
251	}
252	if (! eof($handle)) {
253	$handle->ungetc($b4); # this byte is an ascii byte
254	}
255	return $c;
256	}
257
258
259
260	# will convert entire contents of file to utf8 and append result to $outputref
261	# this may be a slightly faster way to get the contents of a file than by
262	# recursively calling read_line()
263	sub read_file {
264	my $self = shift (@_);
265	my ($outputref) = @_;
266
267	# make sure we have a file handle
268	return if ($self->{'handle'} eq "");
269
270	my $handle = $self->{'handle'};
271
272	# if encoding is set to utf8 or unicode, sniff to see if there is a
273	# byte order marker
274	if ($self->{'first'} &&
275	($self->{'encoding'} eq "utf8" \|\| $self->{'encoding'} eq 'unicode')) {
276	# this will change $self's encoding if there is a BOM
277	my $read_text = $self->find_unicode_bom();
278	$$outputref .= $read_text if (defined($read_text));
279	}
280
281	if ($self->{'encoding'} eq "utf8") {
282	undef $/;
283	$$outputref .= <$handle>;
284	$/ = "\n";
285	return;
286	}
287
288	if ($self->{'encoding'} eq "unicode") {
289	my $line = "";
290	while (defined ($line = $self->read_line())) {
291	$$outputref .= $line;
292	}
293	return;
294	}
295
296	if ($self->{'encoding'} eq "iso_8859_1" \|\| $self->{'encoding'} eq "ascii") {
297	# we'll use ascii2utf8() for this as it's faster than going
298	# through convert2unicode()
299	undef $/;
300	my $text = <$handle>;
301	$/ = "\n";
302	$$outputref .= &unicode::ascii2utf8 (\$text);
303	return;
304	}
305
306	# everything else uses unicode::convert2unicode
307	undef $/;
308	my $text = <$handle>;
309	$/ = "\n";
310	$$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
311	}
312
313	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: