Context Navigation

source: branches/New_Config_Format-branch/gsdl/perllib/multiread.pm@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago
merged changes to trunk into New_Config_Format branch
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	# gb - GB
31	# iso_8859_1 - extended ascii (iso-8859-1)
32	# iso_8859_6 - 8 bit arabic (iso-8859-6)
33	# windows_1256 - Windows codepage 1256 (Arabic)
34
35	package multiread;
36
37	use unicode;
38	use gb;
39
40	sub new {
41	my ($class) = @_;
42
43	my $self = {'handle' => "",
44	'first' => 1,
45	'encoding' => "utf8",
46	'bigendian' => 1};
47
48	return bless $self, $class;
49	}
50
51	# set_handle expects the file to be already open but
52	# not read yet
53	sub set_handle {
54	my $self = shift (@_);
55	($self->{'handle'}) = @_;
56	$self->{'first'} = 1;
57	$self->{'encoding'} = "utf8";
58	$self->{'bigendian'} = 1;
59	}
60
61	# set_encoding should be called after set_handle
62	sub set_encoding {
63	my $self = shift (@_);
64	($self->{'encoding'}) = @_;
65	}
66
67	sub get_encoding {
68	my $self = shift (@_);
69	return $self->{'encoding'};
70	}
71
72	# undef will be returned if the eof has been reached
73	# the result will always be returned in utf-8
74	# if automatic detection between utf8 and unicode is desired
75	# then the encoding should be initially set to utf8
76	sub read_char {
77	my $self = shift (@_);
78
79	# make sure we have a file handle
80	return undef if ($self->{'handle'} eq "");
81	my $handle = $self->{'handle'};
82
83	if ($self->{'encoding'} eq "utf8") {
84	# utf-8 text, how many characters we get depends
85	# on what we find
86	my $c1 = "";
87	my $c2 = "";
88	my $c3 = "";
89
90	while (!eof ($handle)) {
91	$c1 = ord (getc ($handle));
92	if ($self->{'first'}) {
93	$self->{'first'} = 0;
94
95	if ($c1 == 0xfe \|\| $c1 == 0xff) {
96	$c2 = ord (getc ($handle)) if (!eof ($handle));
97
98	# if unicode fall through to the unicode reading code
99	if ($c1 == 0xff && $c2 == 0xfe) {
100	$self->{'encoding'} = "unicode";
101	$self->{'bigendian'} = 0;
102	if ($ENV{'GSDLOS'} =~ /windows/i) {
103	binmode ($handle); # silly windows
104	}
105	last;
106
107	} elsif ($c1 == 0xfe && $c2 == 0xff) {
108	$self->{'encoding'} = "unicode";
109	$self->{'bigendian'} = 1;
110	if ($ENV{'GSDLOS'} =~ /windows/i) {
111	binmode ($handle); # silly windows
112	}
113	last;
114	}
115
116	# an error, but we might be able to recover
117	# from it
118	$c1 = $c2;
119	}
120	}
121
122	if ($c1 <= 0x7f) {
123	# one byte character
124	return chr ($c1);
125
126	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
127	# two byte character
128	$c2 = getc ($handle) if (!eof ($handle));
129	return chr ($c1) . $c2;
130
131	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
132	# three byte character
133	$c2 = getc ($handle) if (!eof ($handle));
134	$c3 = getc ($handle) if (!eof ($handle));
135	return chr ($c1) . $c2 . $c3;
136	}
137
138	# if we get here there was an error in the file, we should
139	# be able to recover from it however, maybe the file is in
140	# another encoding
141	}
142
143	return undef if (eof ($handle));
144	}
145
146	if ($self->{'encoding'} eq "unicode") {
147	# unicode text, get the next two characters
148	return undef if (eof ($handle));
149	my $c1 = ord (getc ($handle));
150	return undef if (eof ($handle));
151	my $c2 = ord (getc ($handle));
152
153	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
154	}
155
156	if ($self->{'encoding'} eq "gb") {
157	# GB or GBK
158	return undef if (eof ($handle));
159	my $c1 = getc ($handle);
160	if (ord ($c1) >= 0x81) {
161	# double byte character
162	return undef if (eof ($handle));
163	my $c2 = getc ($handle);
164	return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
165
166	} else {
167	# single byte character
168	return &unicode::ascii2utf8 ($c1);
169	}
170	}
171
172	if ($self->{'encoding'} eq "iso_8859_1") {
173	# Latin 1 extended ascii (ISO-8859-1)
174	return undef if (eof ($handle));
175	return &unicode::ascii2utf8 (getc ($handle));
176	}
177
178	if ($self->{'encoding'} eq "iso_8859_6") {
179	# 8 bit Arabic (IOS-8859-6)
180	return undef if (eof ($handle));
181	return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
182	}
183
184	if ($self->{'encoding'} eq "windows_1256") {
185	# Windows 1256 (Arabic)
186	return undef if (eof ($handle));
187	return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
188	}
189
190	# unknown encoding
191	return undef;
192	}
193
194
195	# undef will be returned if the eof has been reached
196	# the result will always be returned in utf-8
197	sub read_line {
198	my $self = shift (@_);
199
200	# make sure we have a file handle
201	return undef if ($self->{'handle'} eq "");
202
203	my $handle = $self->{'handle'};
204
205	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
206	# special case for the first line of utf8 text to detect whether
207	# the file is in utf8 or unicode
208	my $out = "";
209	my $thisc = "";
210	while (defined ($thisc = $self->read_char())) {
211	$out .= $thisc;
212	last if ($thisc eq "\n");
213	}
214
215	return $out if (length ($out) > 0);
216	return undef;
217	}
218
219
220	if ($self->{'encoding'} eq "utf8") {
221	# utf-8 line
222	return <$handle>;
223	}
224
225	if ($self->{'encoding'} eq "unicode") {
226	# unicode line
227	my $c = "";
228	my ($c1, $c2) = ("", "");
229	my $out = "";
230	while (read ($handle, $c, 2) == 2) {
231	$c1 = ord (substr ($c, 0, 1));
232	$c2 = ord (substr ($c, 1, 1));
233	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
234	$out .= $c;
235	last if ($c eq "\n");
236	}
237
238	return $out if (length ($out) > 0);
239	return undef;
240	}
241
242	if ($self->{'encoding'} eq "gb") {
243	# GB or GBK
244	my $line = "";
245	if (defined ($line = <$handle>)) {
246	return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
247	}
248	return undef;
249	}
250
251	if ($self->{'encoding'} eq "iso_8859_1") {
252	# extended ascii (ISO-8859-1)
253	my $line = "";
254	if (defined ($line = <$handle>)) {
255	return &unicode::ascii2utf8 ($line);
256	}
257	return undef;
258	}
259
260	if ($self->{'encoding'} eq "iso_8859_6") {
261	# 8 bit arabic (ISO-8859-6)
262	my $line = "";
263	if (defined ($line = <$handle>)) {
264	return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
265	}
266	return undef;
267	}
268
269	if ($self->{'encoding'} eq "windows_1256") {
270	# Windows 1256 (Arabic)
271	my $line = "";
272	if (defined ($line = <$handle>)) {
273	return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
274	}
275	return undef;
276	}
277
278	# unknown encoding
279	return undef;
280	}
281
282
283	# will convert entire contents of file to utf8 and append result to $outputref
284	# this may be a slightly faster way to get the contents of a file than by
285	# recursively calling read_line()
286	sub read_file {
287	my $self = shift (@_);
288	my ($outputref) = @_;
289
290	# make sure we have a file handle
291	return if ($self->{'handle'} eq "");
292
293	my $handle = $self->{'handle'};
294
295	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
296	# special case for the first line of utf8 text to detect whether
297	# the file is in utf8 or unicode
298	$$text .= $self->read_line ();
299	}
300
301	if ($self->{'encoding'} eq "utf8") {
302	undef $/;
303	$$outputref .= <$handle>;
304	$/ = "\n";
305	return;
306	}
307
308	if ($self->{'encoding'} eq "unicode") {
309	my $line = "";
310	while (defined ($line = $self->read_line())) {
311	$$outputref .= $line;
312	}
313	return;
314	}
315
316	if ($self->{'encoding'} eq "gb") {
317	undef $/;
318	my $text = <$handle>;
319	$/ = "\n";
320	$$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
321	return;
322	}
323
324	if ($self->{'encoding'} eq "iso_8859_1") {
325	undef $/;
326	my $text = <$handle>;
327	$/ = "\n";
328	$$outputref .= &unicode::ascii2utf8 ($text);
329	return;
330	}
331
332	if ($self->{'encoding'} eq "iso_8859_6") {
333	my $text = <$handle>;
334	undef $/;
335	$/ = "\n";
336	$$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
337	return;
338	}
339
340	if ($self->{'encoding'} eq "windows_1256") {
341	undef $/;
342	my $text = <$handle>;
343	$/ = "\n";
344	$$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
345	return;
346	}
347	}
348
349
350	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: