Context Navigation

source: main/tags/2.52/gsdl/perllib/multiread.pm@ 25422

Last change on this file since 25422 was 3834, checked in by sjboddie, 21 years ago
Prevent "use bytes" from causing errors for older perls
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.9 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	#
31	# plus all encodings in the "encodings" package
32
33	package multiread;
34
35	eval {require bytes};
36
37	use unicode;
38
39	sub new {
40	my ($class) = @_;
41
42	my $self = {'handle' => "",
43	'first' => 1,
44	'encoding' => "utf8",
45	'bigendian' => 1};
46
47	return bless $self, $class;
48	}
49
50	# set_handle expects the file to be already open but
51	# not read yet
52	sub set_handle {
53	my $self = shift (@_);
54	($self->{'handle'}) = @_;
55	$self->{'first'} = 1;
56	$self->{'encoding'} = "utf8";
57	$self->{'bigendian'} = 1;
58	}
59
60	# set_encoding should be called after set_handle
61	sub set_encoding {
62	my $self = shift (@_);
63	($self->{'encoding'}) = @_;
64	}
65
66	sub get_encoding {
67	my $self = shift (@_);
68	return $self->{'encoding'};
69	}
70
71	# undef will be returned if the eof has been reached
72	# the result will always be returned in utf-8
73	# if automatic detection between utf8 and unicode is desired
74	# then the encoding should be initially set to utf8
75	sub read_unicode_char {
76	my $self = shift (@_);
77
78	# make sure we have a file handle
79	return undef if ($self->{'handle'} eq "");
80	my $handle = $self->{'handle'};
81	binmode ($handle);
82
83	if ($self->{'encoding'} eq "utf8") {
84	# utf-8 text, how many characters we get depends
85	# on what we find
86	my $c1 = "";
87	my $c2 = "";
88	my $c3 = "";
89
90	while (!eof ($handle)) {
91	$c1 = ord (getc ($handle));
92	if ($self->{'first'}) {
93	$self->{'first'} = 0;
94
95	if ($c1 == 0xfe \|\| $c1 == 0xff) {
96	$c2 = ord (getc ($handle)) if (!eof ($handle));
97
98	# if unicode fall through to the unicode reading code
99	if ($c1 == 0xff && $c2 == 0xfe) {
100	$self->{'encoding'} = "unicode";
101	$self->{'bigendian'} = 0;
102	last;
103
104	} elsif ($c1 == 0xfe && $c2 == 0xff) {
105	$self->{'encoding'} = "unicode";
106	$self->{'bigendian'} = 1;
107	last;
108	}
109
110	# an error, but we might be able to recover
111	# from it
112	$c1 = $c2;
113	}
114	}
115
116	if ($c1 <= 0x7f) {
117	# one byte character
118	return chr ($c1);
119
120	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
121	# two byte character
122	$c2 = getc ($handle) if (!eof ($handle));
123	return chr ($c1) . $c2;
124
125	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
126	# three byte character
127	$c2 = getc ($handle) if (!eof ($handle));
128	$c3 = getc ($handle) if (!eof ($handle));
129	return chr ($c1) . $c2 . $c3;
130	}
131
132	# if we get here there was an error in the file, we should
133	# be able to recover from it however, maybe the file is in
134	# another encoding
135	}
136
137	return undef if (eof ($handle));
138	}
139
140	if ($self->{'encoding'} eq "unicode") {
141	# unicode text, get the next two characters
142	return undef if (eof ($handle));
143	my $c1 = ord (getc ($handle));
144	return undef if (eof ($handle));
145	my $c2 = ord (getc ($handle));
146
147	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
148	}
149
150	return undef;
151	}
152
153
154	# undef will be returned if the eof has been reached
155	# the result will always be returned in utf-8
156	sub read_line {
157	my $self = shift (@_);
158
159	# make sure we have a file handle
160	return undef if ($self->{'handle'} eq "");
161
162	my $handle = $self->{'handle'};
163
164	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
165	# special case for the first line of utf8 text to detect whether
166	# the file is in utf8 or unicode
167	my $out = "";
168	my $thisc = "";
169	while (defined ($thisc = $self->read_unicode_char())) {
170	$out .= $thisc;
171	last if ($thisc eq "\n");
172	}
173
174	return $out if (length ($out) > 0);
175	return undef;
176	}
177
178	if ($self->{'encoding'} eq "utf8") {
179	# utf-8 line
180	return <$handle>;
181	}
182
183	if ($self->{'encoding'} eq "unicode") {
184	# unicode line
185	my $c = "";
186	my ($c1, $c2) = ("", "");
187	my $out = "";
188	while (read ($handle, $c, 2) == 2) {
189	$c1 = ord (substr ($c, 0, 1));
190	$c2 = ord (substr ($c, 1, 1));
191	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
192	$out .= $c;
193	last if ($c eq "\n");
194	}
195
196	return $out if (length ($out) > 0);
197	return undef;
198	}
199
200	if ($self->{'encoding'} eq "iso_8859_1") {
201	# we'll use ascii2utf8() for this as it's faster than going
202	# through convert2unicode()
203	my $line = "";
204	if (defined ($line = <$handle>)) {
205	return &unicode::ascii2utf8 (\$line);
206	}
207	}
208
209	# everything else uses unicode::convert2unicode
210	my $line = "";
211	if (defined ($line = <$handle>)) {
212	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
213	}
214
215	return undef;
216	}
217
218
219	# will convert entire contents of file to utf8 and append result to $outputref
220	# this may be a slightly faster way to get the contents of a file than by
221	# recursively calling read_line()
222	sub read_file {
223	my $self = shift (@_);
224	my ($outputref) = @_;
225
226	# make sure we have a file handle
227	return if ($self->{'handle'} eq "");
228
229	my $handle = $self->{'handle'};
230
231	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
232	# special case for the first line of utf8 text to detect whether
233	# the file is in utf8 or unicode
234
235	# possible to have no text here...
236	my $read_text = $self->read_line ();
237	$$outputref .= $read_text if (defined($read_text));
238	}
239
240	if ($self->{'encoding'} eq "utf8") {
241	undef $/;
242	$$outputref .= <$handle>;
243	$/ = "\n";
244	return;
245	}
246
247	if ($self->{'encoding'} eq "unicode") {
248	my $line = "";
249	while (defined ($line = $self->read_line())) {
250	$$outputref .= $line;
251	}
252	return;
253	}
254
255	if ($self->{'encoding'} eq "iso_8859_1") {
256	# we'll use ascii2utf8() for this as it's faster than going
257	# through convert2unicode()
258	undef $/;
259	my $text = <$handle>;
260	$/ = "\n";
261	$$outputref .= &unicode::ascii2utf8 (\$text);
262	return;
263	}
264
265	# everything else uses unicode::convert2unicode
266	undef $/;
267	my $text = <$handle>;
268	$/ = "\n";
269	$$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
270	}
271
272	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: