source: trunk/gsdl/perllib/multiread.pm@ 627

Last change on this file since 627 was 627, checked in by rjmcnab, 25 years ago

initial revision.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.1 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30# gb - GB
31# extended - extended ascii
32
33
34package multiread;
35
36use unicode;
37use gb;
38
39sub new {
40 my ($class) = @_;
41
42 my $self = {'handle' => "",
43 'first' => 1,
44 'encoding' => "utf8",
45 'bigendian' => 1};
46
47 return bless $self, $class;
48}
49
50# set_handle expects the file to be already open but
51# not read yet
52sub set_handle {
53 my $self = shift (@_);
54 ($self->{'handle'}) = @_;
55 $self->{'first'} = 1;
56 $self->{'encoding'} = "utf8";
57 $self->{'bigendian'} = 1;
58}
59
60# set_encoding should be called after set_handle
61sub set_encoding {
62 my $self = shift (@_);
63 ($self->{'encoding'}) = @_;
64}
65
66sub get_encoding {
67 my $self = shift (@_);
68 return $self->{'encoding'};
69}
70
71# undef will be returned if the eof has been reached
72# the result will always be returned in utf-8
73# if automatic detection between utf8 and unicode is desired
74# then the encoding should be initially set to utf8
75sub read_char {
76 my $self = shift (@_);
77
78 # make sure we have a file handle
79 return undef if ($self->{'handle'} eq "");
80 my $handle = $self->{'handle'};
81
82 if ($self->{'encoding'} eq "utf8") {
83 # utf-8 text, how many characters we get depends
84 # on what we find
85 my $c1 = "";
86 my $c2 = "";
87 my $c3 = "";
88
89 while (!eof ($handle)) {
90 $c1 = ord (getc ($handle));
91 if ($self->{'first'}) {
92 $self->{'first'} = 0;
93
94 if ($c1 == 0xfe || $c1 == 0xff) {
95 $c2 = ord (getc ($handle)) if (!eof ($handle));
96
97 # if unicode fall through to the unicode reading code
98 if ($c1 == 0xff && $c2 == 0xfe) {
99 $self->{'encoding'} = "unicode";
100 $self->{'bigendian'} = 0;
101 if ($ENV{'GSDLOS'} =~ /windows/i) {
102 binmode ($handle); # silly windows
103 }
104 last;
105
106 } elsif ($c1 == 0xfe && $c2 == 0xff) {
107 $self->{'encoding'} = "unicode";
108 $self->{'bigendian'} = 1;
109 if ($ENV{'GSDLOS'} =~ /windows/i) {
110 binmode ($handle); # silly windows
111 }
112 last;
113 }
114
115 # an error, but we might be able to recover
116 # from it
117 $c1 = $c2;
118 }
119 }
120
121 if ($c1 <= 0x7f) {
122 # one byte character
123 return chr ($c1);
124
125 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
126 # two byte character
127 $c2 = getc ($handle) if (!eof ($handle));
128 return chr ($c1) . $c2;
129
130 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
131 # three byte character
132 $c2 = getc ($handle) if (!eof ($handle));
133 $c3 = getc ($handle) if (!eof ($handle));
134 return chr ($c1) . $c2 . $c3;
135 }
136
137 # if we get here there was an error in the file, we should
138 # be able to recover from it however, maybe the file is in
139 # another encoding
140 }
141
142 return undef if (eof ($handle));
143 }
144
145 if ($self->{'encoding'} eq "unicode") {
146 # unicode text, get the next two characters
147 return undef if (eof ($handle));
148 my $c1 = ord (getc ($handle));
149 return undef if (eof ($handle));
150 my $c2 = ord (getc ($handle));
151
152 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
153 }
154
155 if ($self->{'encoding'} eq "gb") {
156 # GB or GBK
157 return undef if (eof ($handle));
158 my $c1 = getc ($handle);
159 if (ord ($c1) >= 0x81) {
160 # double byte character
161 return undef if (eof ($handle));
162 my $c2 = getc ($handle);
163 return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
164
165 } else {
166 # single byte character
167 return &unicode::ascii2utf8 ($c1);
168 }
169 }
170
171 if ($self->{'encoding'} eq "extended") {
172 # extended ascii
173 return undef if (eof ($handle));
174 return &unicode::ascii2utf8 (getc ($handle));
175 }
176
177 # unknown encoding
178 return undef;
179}
180
181
182# undef will be returned if the eof has been reached
183# the result will always be returned in utf-8
184sub read_line {
185 my $self = shift (@_);
186
187 # make sure we have a file handle
188 return undef if ($self->{'handle'} eq "");
189
190 my $handle = $self->{'handle'};
191
192 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
193 # special case for the first line of utf8 text to detect whether
194 # the file is in utf8 or unicode
195 my $out = "";
196 my $thisc = "";
197 while (defined ($thisc = $self->read_char())) {
198 $out .= $thisc;
199 last if ($thisc eq "\n");
200 }
201
202 return $out if (length ($out) > 0);
203 return undef;
204 }
205
206
207 if ($self->{'encoding'} eq "utf8") {
208 # utf-8 line
209 return <$handle>;
210 }
211
212 if ($self->{'encoding'} eq "unicode") {
213 # unicode line
214 my $c = "";
215 my ($c1, $c2) = ("", "");
216 my $out = "";
217 while (read ($handle, $c, 2) == 2) {
218 $c1 = ord (substr ($c, 0, 1));
219 $c2 = ord (substr ($c, 1, 1));
220 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
221 $out .= $c;
222 last if ($c eq "\n");
223 }
224
225 return $out if (length ($out) > 0);
226 return undef;
227 }
228
229 if ($self->{'encoding'} eq "gb") {
230 # GB or GBK
231 my $line = "";
232 if (defined ($line = <$handle>)) {
233 return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
234 }
235 return undef;
236 }
237
238 if ($self->{'encoding'} eq "extended") {
239 # extended ascii
240 my $line = "";
241 if (defined ($line = <$handle>)) {
242 return &unicode::ascii2utf8 ($line);
243 }
244 return undef;
245 }
246
247 # unknown encoding
248 return undef;
249}
250
251
2521;
Note: See TracBrowser for help on using the repository browser.