Context Navigation

source: gsdl/trunk/perllib/multiread.pm@ 18342

Last change on this file since 18342 was 16793, checked in by davidb, 16 years ago
Double nested function call broken down in to two separate lines to help debugging
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.5 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	# Copyright (C) 2005 New Zealand Digital Library project
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either version 2 of the License, or
11	# (at your option) any later version.
12	#
13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21	#
22	###########################################################################
23
24	# the multiread object will read in a number of encodings,
25	# the results are always returned in the utf-8 format
26
27	# encodings currently supported are
28	#
29	# utf8 - either utf8 or unicode (automatically detected)
30	# unicode - 2-byte UCS (does endian detection)
31	#
32	# plus all encodings in the "encodings" package
33
34	package multiread;
35
36	eval {require bytes};
37
38	use strict;
39	no strict 'refs'; # allow filehandles to be variables and viceversa
40
41	use unicode;
42
43	sub new {
44	my ($class) = @_;
45
46	my $self = {'handle' => "",
47	'first' => 1,
48	'encoding' => "utf8",
49	'bigendian' => 1};
50
51	return bless $self, $class;
52	}
53
54	# set_handle expects the file to be already open but
55	# not read yet
56	sub set_handle {
57	my $self = shift;
58	$self->{'handle'} = shift;
59	binmode( $self->{'handle'} );
60	$self->{'first'} = 1;
61	$self->{'encoding'} = "utf8";
62	$self->{'bigendian'} = 1;
63	}
64
65	# set_encoding should be called after set_handle
66	sub set_encoding {
67	my $self = shift;
68	$self->{'encoding'} = shift;
69	}
70
71	sub get_encoding {
72	my $self = shift (@_);
73	return $self->{'encoding'};
74	}
75
76	# undef will be returned if the eof has been reached
77	# the result will always be returned in utf-8
78
79	sub read_unicode_char {
80	my $self = shift (@_);
81
82	# make sure we have a file handle
83	return undef if ($self->{'handle'} eq "");
84	my $handle = $self->{'handle'};
85
86	if ($self->{'encoding'} eq "utf8") {
87	# utf-8 text, how many characters we get depends
88	# on what we find
89	my $c1 = "";
90	my $c2 = "";
91	my $c3 = "";
92
93	while (!eof ($handle)) {
94	$c1 = ord (getc ($handle));
95
96	if ($c1 <= 0x7f) {
97	# one byte character
98	return chr ($c1);
99
100	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
101	# two byte character
102	$c2 = getc ($handle) if (!eof ($handle));
103	return chr ($c1) . $c2;
104
105	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
106	# three byte character
107	$c2 = getc ($handle) if (!eof ($handle));
108	$c3 = getc ($handle) if (!eof ($handle));
109	return chr ($c1) . $c2 . $c3;
110	}
111
112	# if we get here there was an error in the file, we should
113	# be able to recover from it however, maybe the file is in
114	# another encoding
115	}
116
117	return undef if (eof ($handle));
118	}
119
120	if ($self->{'encoding'} eq "unicode") {
121	# unicode text, get the next two characters
122	return undef if (eof ($handle));
123	my $c1 = ord (getc ($handle));
124	return undef if (eof ($handle));
125	my $c2 = ord (getc ($handle));
126
127	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
128	}
129
130	return undef;
131	}
132
133
134	sub unicodechar_to_ord
135	{
136	my $self = shift (@_);
137	my ($unicode_text) = @_;
138
139	my $bigendian_ord_array = [];
140
141	my @unicodechar_array = ($unicode_text =~ m/(..)/g);
142
143	foreach my $pair (@unicodechar_array) {
144	# for each 2 byte pair
145	my $c1=ord(substr($pair,0,1));
146	my $c2=ord(substr($pair,1,1));
147
148	my $be_ord = ($self->{'bigendian'}) ? $c1256+$c2 : $c2256+$c1;
149	push(@$bigendian_ord_array,$be_ord);
150	}
151
152	return $bigendian_ord_array;
153	}
154
155
156	# undef will be returned if the eof has been reached
157	# the result will always be returned in utf-8
158	sub read_line {
159	my $self = shift (@_);
160
161	# make sure we have a file handle
162	return undef if ($self->{'handle'} eq "");
163
164	my $handle = $self->{'handle'};
165
166	if ($self->{'encoding'} eq "utf8") {
167	# utf-8 line
168	return <$handle>;
169	}
170
171	if ($self->{'encoding'} eq "unicode") {
172	# unicode line
173	my $c = "";
174	my ($c1, $c2) = ("", "");
175	my $out = "";
176	while (read ($handle, $c, 2) == 2) {
177	$c1 = ord (substr ($c, 0, 1));
178	$c2 = ord (substr ($c, 1, 1));
179	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
180	$out .= $c;
181	last if ($c eq "\n");
182	}
183
184	return $out if (length ($out) > 0);
185	return undef;
186	}
187
188	if ($self->{'encoding'} eq "iso_8859_1") {
189	# we'll use ascii2utf8() for this as it's faster than going
190	# through convert2unicode()
191	my $line = "";
192	if (defined ($line = <$handle>)) {
193	return &unicode::ascii2utf8 (\$line);
194	}
195	}
196
197	# everything else uses unicode::convert2unicode
198	my $line = "";
199	if (defined ($line = <$handle>)) {
200	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
201	}
202
203	return undef;
204	}
205
206
207
208	# this will look for a Byte Order Marker at the start of the file, and
209	# set the encoding appropriately if there is one, returning any
210	# non-marker text on the first line (or returns undef).
211	sub find_unicode_bom {
212	my $self=shift;
213
214	my $non_bom_text=""; # to return if we read in 'real' text
215
216	if ($self->{'first'} == 0) { return }
217
218	# make sure we have a file handle
219	return if ($self->{'handle'} eq "");
220	my $handle = $self->{'handle'};
221
222	$self->{'first'} = 0;
223
224	my $gc = getc ($handle);
225	my $b1 = ord($gc);
226	my $b2;
227	my $b3;
228
229	if ($b1 == 0xfe \|\| $b1 == 0xff) {
230	$b2 = ord (getc ($handle)) if (!eof ($handle));
231	if ($b1 == 0xff && $b2 == 0xfe) {
232	$self->{'encoding'} = "unicode";
233	$self->{'bigendian'} = 0;
234	return;
235	} elsif ($b1 == 0xfe && $b2 == 0xff) {
236	$self->{'encoding'} = "unicode";
237	$self->{'bigendian'} = 1;
238	return;
239	} elsif ($b1 == 0xef && $b2 == 0xbb) {
240	$b3 = ord(getc($handle));
241	if ($b3 == 0xbf) {
242	$self->{'encoding'} = "utf8";
243	$self->{'bigendian'} = 1;
244	return;
245	}
246	else {
247	# put back all three bytes
248	$handle->ungetc($b3);
249	$handle->ungetc($b2);
250	$handle->ungetc($b1); return;
251
252	}
253	}
254	else {
255	# put back two bytes read
256	$handle->ungetc($b2);
257	$handle->ungetc($b1); return;
258	}
259	} else { # $b1 != fe or ff
260	# put back the one byte read
261	$handle->ungetc($b1); return;
262	}
263	}
264
265
266	sub read_file_no_decoding
267	{
268	my $self = shift (@_);
269	my ($outputref) = @_;
270
271	# make sure we have a file handle
272	return if ($self->{'handle'} eq "");
273
274	my $handle = $self->{'handle'};
275
276	# if encoding is set to utf8 or unicode, sniff to see if there is a
277	# byte order marker
278	if ($self->{'first'} &&
279	($self->{'encoding'} eq "utf8" \|\| $self->{'encoding'} eq 'unicode')) {
280
281	# this will change $self's encoding if there is a BOM (but won't consume any characters)
282	$self->find_unicode_bom();
283	}
284
285	undef $/;
286	$$outputref .= <$handle>;
287	$/ = "\n";
288	}
289
290
291	# will convert entire contents of file to utf8 and append result to $outputref
292	# this may be a slightly faster way to get the contents of a file than by
293	# recursively calling read_line()
294	sub decode_text {
295	my $self = shift (@_);
296
297	my ($raw_text,$decoded_text_ref) = @_;
298
299	if ($self->{'encoding'} eq "utf8") {
300	# Nothing to do, raw text is in utf 8
301	$$decoded_text_ref .= $raw_text;
302	return;
303	}
304
305	if ($self->{'encoding'} eq "unicode") {
306	my $unicode_array = $self->unicodechar_to_ord($raw_text);
307	$$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
308	return;
309	}
310
311	if ($self->{'encoding'} eq "iso_8859_1" \|\| $self->{'encoding'} eq "ascii") {
312	# we'll use ascii2utf8() for this as it's faster than going
313	# through convert2unicode()
314	$$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
315	return;
316	}
317
318	# everything else uses unicode::convert2unicode
319	$$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text));
320	}
321
322
323
324	# will convert entire contents of file to utf8 and append result to $outputref
325	# this may be a slightly faster way to get the contents of a file than by
326	# recursively calling read_line()
327	sub read_file {
328	my $self = shift (@_);
329	my ($outputref) = @_;
330
331	# While unusual, $raw_text is initialized to $$outputref
332	# to be consistent with code before refactoring
333	my $raw_text = $$outputref;
334
335	$self->read_file_no_decoding(\$raw_text);
336	$self->decode_text($raw_text,$outputref);
337	}
338
339	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: