Context Navigation

source: tags/gsdl-2_30d-distribution/gsdl/perllib/multiread.pm@ 2308

Last change on this file since 2308 was 1870, checked in by sjboddie, 23 years ago
Tidied up language support stuff.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.8 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	#
31	# plus all encodings in the "encodings" package
32
33	package multiread;
34
35	use unicode;
36
37	sub new {
38	my ($class) = @_;
39
40	my $self = {'handle' => "",
41	'first' => 1,
42	'encoding' => "utf8",
43	'bigendian' => 1};
44
45	return bless $self, $class;
46	}
47
48	# set_handle expects the file to be already open but
49	# not read yet
50	sub set_handle {
51	my $self = shift (@_);
52	($self->{'handle'}) = @_;
53	$self->{'first'} = 1;
54	$self->{'encoding'} = "utf8";
55	$self->{'bigendian'} = 1;
56	}
57
58	# set_encoding should be called after set_handle
59	sub set_encoding {
60	my $self = shift (@_);
61	($self->{'encoding'}) = @_;
62	}
63
64	sub get_encoding {
65	my $self = shift (@_);
66	return $self->{'encoding'};
67	}
68
69	# undef will be returned if the eof has been reached
70	# the result will always be returned in utf-8
71	# if automatic detection between utf8 and unicode is desired
72	# then the encoding should be initially set to utf8
73	sub read_unicode_char {
74	my $self = shift (@_);
75
76	# make sure we have a file handle
77	return undef if ($self->{'handle'} eq "");
78	my $handle = $self->{'handle'};
79	binmode ($handle);
80
81	if ($self->{'encoding'} eq "utf8") {
82	# utf-8 text, how many characters we get depends
83	# on what we find
84	my $c1 = "";
85	my $c2 = "";
86	my $c3 = "";
87
88	while (!eof ($handle)) {
89	$c1 = ord (getc ($handle));
90	if ($self->{'first'}) {
91	$self->{'first'} = 0;
92
93	if ($c1 == 0xfe \|\| $c1 == 0xff) {
94	$c2 = ord (getc ($handle)) if (!eof ($handle));
95
96	# if unicode fall through to the unicode reading code
97	if ($c1 == 0xff && $c2 == 0xfe) {
98	$self->{'encoding'} = "unicode";
99	$self->{'bigendian'} = 0;
100	last;
101
102	} elsif ($c1 == 0xfe && $c2 == 0xff) {
103	$self->{'encoding'} = "unicode";
104	$self->{'bigendian'} = 1;
105	last;
106	}
107
108	# an error, but we might be able to recover
109	# from it
110	$c1 = $c2;
111	}
112	}
113
114	if ($c1 <= 0x7f) {
115	# one byte character
116	return chr ($c1);
117
118	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
119	# two byte character
120	$c2 = getc ($handle) if (!eof ($handle));
121	return chr ($c1) . $c2;
122
123	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
124	# three byte character
125	$c2 = getc ($handle) if (!eof ($handle));
126	$c3 = getc ($handle) if (!eof ($handle));
127	return chr ($c1) . $c2 . $c3;
128	}
129
130	# if we get here there was an error in the file, we should
131	# be able to recover from it however, maybe the file is in
132	# another encoding
133	}
134
135	return undef if (eof ($handle));
136	}
137
138	if ($self->{'encoding'} eq "unicode") {
139	# unicode text, get the next two characters
140	return undef if (eof ($handle));
141	my $c1 = ord (getc ($handle));
142	return undef if (eof ($handle));
143	my $c2 = ord (getc ($handle));
144
145	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
146	}
147
148	return undef;
149	}
150
151
152	# undef will be returned if the eof has been reached
153	# the result will always be returned in utf-8
154	sub read_line {
155	my $self = shift (@_);
156
157	# make sure we have a file handle
158	return undef if ($self->{'handle'} eq "");
159
160	my $handle = $self->{'handle'};
161
162	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
163	# special case for the first line of utf8 text to detect whether
164	# the file is in utf8 or unicode
165	my $out = "";
166	my $thisc = "";
167	while (defined ($thisc = $self->read_unicode_char())) {
168	$out .= $thisc;
169	last if ($thisc eq "\n");
170	}
171
172	return $out if (length ($out) > 0);
173	return undef;
174	}
175
176	if ($self->{'encoding'} eq "utf8") {
177	# utf-8 line
178	return <$handle>;
179	}
180
181	if ($self->{'encoding'} eq "unicode") {
182	# unicode line
183	my $c = "";
184	my ($c1, $c2) = ("", "");
185	my $out = "";
186	while (read ($handle, $c, 2) == 2) {
187	$c1 = ord (substr ($c, 0, 1));
188	$c2 = ord (substr ($c, 1, 1));
189	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
190	$out .= $c;
191	last if ($c eq "\n");
192	}
193
194	return $out if (length ($out) > 0);
195	return undef;
196	}
197
198	if ($self->{'encoding'} eq "iso_8859_1") {
199	# we'll use ascii2utf8() for this as it's faster than going
200	# through convert2unicode()
201	my $line = "";
202	if (defined ($line = <$handle>)) {
203	return &unicode::ascii2utf8 (\$line);
204	}
205	}
206
207	# everything else uses unicode::convert2unicode
208	my $line = "";
209	if (defined ($line = <$handle>)) {
210	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
211	}
212
213	return undef;
214	}
215
216
217	# will convert entire contents of file to utf8 and append result to $outputref
218	# this may be a slightly faster way to get the contents of a file than by
219	# recursively calling read_line()
220	sub read_file {
221	my $self = shift (@_);
222	my ($outputref) = @_;
223
224	# make sure we have a file handle
225	return if ($self->{'handle'} eq "");
226
227	my $handle = $self->{'handle'};
228
229	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
230	# special case for the first line of utf8 text to detect whether
231	# the file is in utf8 or unicode
232	$$text .= $self->read_line ();
233	}
234
235	if ($self->{'encoding'} eq "utf8") {
236	undef $/;
237	$$outputref .= <$handle>;
238	$/ = "\n";
239	return;
240	}
241
242	if ($self->{'encoding'} eq "unicode") {
243	my $line = "";
244	while (defined ($line = $self->read_line())) {
245	$$outputref .= $line;
246	}
247	return;
248	}
249
250	if ($self->{'encoding'} eq "iso_8859_1") {
251	# we'll use ascii2utf8() for this as it's faster than going
252	# through convert2unicode()
253	undef $/;
254	my $text = <$handle>;
255	$/ = "\n";
256	$$outputref .= &unicode::ascii2utf8 (\$text);
257	return;
258	}
259
260	# everything else uses unicode::convert2unicode
261	undef $/;
262	my $text = <$handle>;
263	$/ = "\n";
264	$$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
265	}
266
267	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: