Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: main/tags/2.20/gsdl/perllib/multiread.pm@ 24171

Last change on this file since 24171 was 627, checked in by rjmcnab, 25 years ago
initial revision.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.1 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	# gb - GB
31	# extended - extended ascii
32
33
34	package multiread;
35
36	use unicode;
37	use gb;
38
39	sub new {
40	my ($class) = @_;
41
42	my $self = {'handle' => "",
43	'first' => 1,
44	'encoding' => "utf8",
45	'bigendian' => 1};
46
47	return bless $self, $class;
48	}
49
50	# set_handle expects the file to be already open but
51	# not read yet
52	sub set_handle {
53	my $self = shift (@_);
54	($self->{'handle'}) = @_;
55	$self->{'first'} = 1;
56	$self->{'encoding'} = "utf8";
57	$self->{'bigendian'} = 1;
58	}
59
60	# set_encoding should be called after set_handle
61	sub set_encoding {
62	my $self = shift (@_);
63	($self->{'encoding'}) = @_;
64	}
65
66	sub get_encoding {
67	my $self = shift (@_);
68	return $self->{'encoding'};
69	}
70
71	# undef will be returned if the eof has been reached
72	# the result will always be returned in utf-8
73	# if automatic detection between utf8 and unicode is desired
74	# then the encoding should be initially set to utf8
75	sub read_char {
76	my $self = shift (@_);
77
78	# make sure we have a file handle
79	return undef if ($self->{'handle'} eq "");
80	my $handle = $self->{'handle'};
81
82	if ($self->{'encoding'} eq "utf8") {
83	# utf-8 text, how many characters we get depends
84	# on what we find
85	my $c1 = "";
86	my $c2 = "";
87	my $c3 = "";
88
89	while (!eof ($handle)) {
90	$c1 = ord (getc ($handle));
91	if ($self->{'first'}) {
92	$self->{'first'} = 0;
93
94	if ($c1 == 0xfe \|\| $c1 == 0xff) {
95	$c2 = ord (getc ($handle)) if (!eof ($handle));
96
97	# if unicode fall through to the unicode reading code
98	if ($c1 == 0xff && $c2 == 0xfe) {
99	$self->{'encoding'} = "unicode";
100	$self->{'bigendian'} = 0;
101	if ($ENV{'GSDLOS'} =~ /windows/i) {
102	binmode ($handle); # silly windows
103	}
104	last;
105
106	} elsif ($c1 == 0xfe && $c2 == 0xff) {
107	$self->{'encoding'} = "unicode";
108	$self->{'bigendian'} = 1;
109	if ($ENV{'GSDLOS'} =~ /windows/i) {
110	binmode ($handle); # silly windows
111	}
112	last;
113	}
114
115	# an error, but we might be able to recover
116	# from it
117	$c1 = $c2;
118	}
119	}
120
121	if ($c1 <= 0x7f) {
122	# one byte character
123	return chr ($c1);
124
125	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
126	# two byte character
127	$c2 = getc ($handle) if (!eof ($handle));
128	return chr ($c1) . $c2;
129
130	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
131	# three byte character
132	$c2 = getc ($handle) if (!eof ($handle));
133	$c3 = getc ($handle) if (!eof ($handle));
134	return chr ($c1) . $c2 . $c3;
135	}
136
137	# if we get here there was an error in the file, we should
138	# be able to recover from it however, maybe the file is in
139	# another encoding
140	}
141
142	return undef if (eof ($handle));
143	}
144
145	if ($self->{'encoding'} eq "unicode") {
146	# unicode text, get the next two characters
147	return undef if (eof ($handle));
148	my $c1 = ord (getc ($handle));
149	return undef if (eof ($handle));
150	my $c2 = ord (getc ($handle));
151
152	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
153	}
154
155	if ($self->{'encoding'} eq "gb") {
156	# GB or GBK
157	return undef if (eof ($handle));
158	my $c1 = getc ($handle);
159	if (ord ($c1) >= 0x81) {
160	# double byte character
161	return undef if (eof ($handle));
162	my $c2 = getc ($handle);
163	return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
164
165	} else {
166	# single byte character
167	return &unicode::ascii2utf8 ($c1);
168	}
169	}
170
171	if ($self->{'encoding'} eq "extended") {
172	# extended ascii
173	return undef if (eof ($handle));
174	return &unicode::ascii2utf8 (getc ($handle));
175	}
176
177	# unknown encoding
178	return undef;
179	}
180
181
182	# undef will be returned if the eof has been reached
183	# the result will always be returned in utf-8
184	sub read_line {
185	my $self = shift (@_);
186
187	# make sure we have a file handle
188	return undef if ($self->{'handle'} eq "");
189
190	my $handle = $self->{'handle'};
191
192	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
193	# special case for the first line of utf8 text to detect whether
194	# the file is in utf8 or unicode
195	my $out = "";
196	my $thisc = "";
197	while (defined ($thisc = $self->read_char())) {
198	$out .= $thisc;
199	last if ($thisc eq "\n");
200	}
201
202	return $out if (length ($out) > 0);
203	return undef;
204	}
205
206
207	if ($self->{'encoding'} eq "utf8") {
208	# utf-8 line
209	return <$handle>;
210	}
211
212	if ($self->{'encoding'} eq "unicode") {
213	# unicode line
214	my $c = "";
215	my ($c1, $c2) = ("", "");
216	my $out = "";
217	while (read ($handle, $c, 2) == 2) {
218	$c1 = ord (substr ($c, 0, 1));
219	$c2 = ord (substr ($c, 1, 1));
220	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
221	$out .= $c;
222	last if ($c eq "\n");
223	}
224
225	return $out if (length ($out) > 0);
226	return undef;
227	}
228
229	if ($self->{'encoding'} eq "gb") {
230	# GB or GBK
231	my $line = "";
232	if (defined ($line = <$handle>)) {
233	return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
234	}
235	return undef;
236	}
237
238	if ($self->{'encoding'} eq "extended") {
239	# extended ascii
240	my $line = "";
241	if (defined ($line = <$handle>)) {
242	return &unicode::ascii2utf8 ($line);
243	}
244	return undef;
245	}
246
247	# unknown encoding
248	return undef;
249	}
250
251
252	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: