source: branches/New_Config_Format-branch/gsdl/perllib/multiread.pm@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30# gb - GB
31# iso_8859_1 - extended ascii (iso-8859-1)
32# iso_8859_6 - 8 bit arabic (iso-8859-6)
33# windows_1256 - Windows codepage 1256 (Arabic)
34
35package multiread;
36
37use unicode;
38use gb;
39
40sub new {
41 my ($class) = @_;
42
43 my $self = {'handle' => "",
44 'first' => 1,
45 'encoding' => "utf8",
46 'bigendian' => 1};
47
48 return bless $self, $class;
49}
50
51# set_handle expects the file to be already open but
52# not read yet
53sub set_handle {
54 my $self = shift (@_);
55 ($self->{'handle'}) = @_;
56 $self->{'first'} = 1;
57 $self->{'encoding'} = "utf8";
58 $self->{'bigendian'} = 1;
59}
60
61# set_encoding should be called after set_handle
62sub set_encoding {
63 my $self = shift (@_);
64 ($self->{'encoding'}) = @_;
65}
66
67sub get_encoding {
68 my $self = shift (@_);
69 return $self->{'encoding'};
70}
71
72# undef will be returned if the eof has been reached
73# the result will always be returned in utf-8
74# if automatic detection between utf8 and unicode is desired
75# then the encoding should be initially set to utf8
76sub read_char {
77 my $self = shift (@_);
78
79 # make sure we have a file handle
80 return undef if ($self->{'handle'} eq "");
81 my $handle = $self->{'handle'};
82
83 if ($self->{'encoding'} eq "utf8") {
84 # utf-8 text, how many characters we get depends
85 # on what we find
86 my $c1 = "";
87 my $c2 = "";
88 my $c3 = "";
89
90 while (!eof ($handle)) {
91 $c1 = ord (getc ($handle));
92 if ($self->{'first'}) {
93 $self->{'first'} = 0;
94
95 if ($c1 == 0xfe || $c1 == 0xff) {
96 $c2 = ord (getc ($handle)) if (!eof ($handle));
97
98 # if unicode fall through to the unicode reading code
99 if ($c1 == 0xff && $c2 == 0xfe) {
100 $self->{'encoding'} = "unicode";
101 $self->{'bigendian'} = 0;
102 if ($ENV{'GSDLOS'} =~ /windows/i) {
103 binmode ($handle); # silly windows
104 }
105 last;
106
107 } elsif ($c1 == 0xfe && $c2 == 0xff) {
108 $self->{'encoding'} = "unicode";
109 $self->{'bigendian'} = 1;
110 if ($ENV{'GSDLOS'} =~ /windows/i) {
111 binmode ($handle); # silly windows
112 }
113 last;
114 }
115
116 # an error, but we might be able to recover
117 # from it
118 $c1 = $c2;
119 }
120 }
121
122 if ($c1 <= 0x7f) {
123 # one byte character
124 return chr ($c1);
125
126 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
127 # two byte character
128 $c2 = getc ($handle) if (!eof ($handle));
129 return chr ($c1) . $c2;
130
131 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
132 # three byte character
133 $c2 = getc ($handle) if (!eof ($handle));
134 $c3 = getc ($handle) if (!eof ($handle));
135 return chr ($c1) . $c2 . $c3;
136 }
137
138 # if we get here there was an error in the file, we should
139 # be able to recover from it however, maybe the file is in
140 # another encoding
141 }
142
143 return undef if (eof ($handle));
144 }
145
146 if ($self->{'encoding'} eq "unicode") {
147 # unicode text, get the next two characters
148 return undef if (eof ($handle));
149 my $c1 = ord (getc ($handle));
150 return undef if (eof ($handle));
151 my $c2 = ord (getc ($handle));
152
153 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
154 }
155
156 if ($self->{'encoding'} eq "gb") {
157 # GB or GBK
158 return undef if (eof ($handle));
159 my $c1 = getc ($handle);
160 if (ord ($c1) >= 0x81) {
161 # double byte character
162 return undef if (eof ($handle));
163 my $c2 = getc ($handle);
164 return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
165
166 } else {
167 # single byte character
168 return &unicode::ascii2utf8 ($c1);
169 }
170 }
171
172 if ($self->{'encoding'} eq "iso_8859_1") {
173 # Latin 1 extended ascii (ISO-8859-1)
174 return undef if (eof ($handle));
175 return &unicode::ascii2utf8 (getc ($handle));
176 }
177
178 if ($self->{'encoding'} eq "iso_8859_6") {
179 # 8 bit Arabic (IOS-8859-6)
180 return undef if (eof ($handle));
181 return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
182 }
183
184 if ($self->{'encoding'} eq "windows_1256") {
185 # Windows 1256 (Arabic)
186 return undef if (eof ($handle));
187 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
188 }
189
190 # unknown encoding
191 return undef;
192}
193
194
195# undef will be returned if the eof has been reached
196# the result will always be returned in utf-8
197sub read_line {
198 my $self = shift (@_);
199
200 # make sure we have a file handle
201 return undef if ($self->{'handle'} eq "");
202
203 my $handle = $self->{'handle'};
204
205 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
206 # special case for the first line of utf8 text to detect whether
207 # the file is in utf8 or unicode
208 my $out = "";
209 my $thisc = "";
210 while (defined ($thisc = $self->read_char())) {
211 $out .= $thisc;
212 last if ($thisc eq "\n");
213 }
214
215 return $out if (length ($out) > 0);
216 return undef;
217 }
218
219
220 if ($self->{'encoding'} eq "utf8") {
221 # utf-8 line
222 return <$handle>;
223 }
224
225 if ($self->{'encoding'} eq "unicode") {
226 # unicode line
227 my $c = "";
228 my ($c1, $c2) = ("", "");
229 my $out = "";
230 while (read ($handle, $c, 2) == 2) {
231 $c1 = ord (substr ($c, 0, 1));
232 $c2 = ord (substr ($c, 1, 1));
233 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
234 $out .= $c;
235 last if ($c eq "\n");
236 }
237
238 return $out if (length ($out) > 0);
239 return undef;
240 }
241
242 if ($self->{'encoding'} eq "gb") {
243 # GB or GBK
244 my $line = "";
245 if (defined ($line = <$handle>)) {
246 return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
247 }
248 return undef;
249 }
250
251 if ($self->{'encoding'} eq "iso_8859_1") {
252 # extended ascii (ISO-8859-1)
253 my $line = "";
254 if (defined ($line = <$handle>)) {
255 return &unicode::ascii2utf8 ($line);
256 }
257 return undef;
258 }
259
260 if ($self->{'encoding'} eq "iso_8859_6") {
261 # 8 bit arabic (ISO-8859-6)
262 my $line = "";
263 if (defined ($line = <$handle>)) {
264 return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
265 }
266 return undef;
267 }
268
269 if ($self->{'encoding'} eq "windows_1256") {
270 # Windows 1256 (Arabic)
271 my $line = "";
272 if (defined ($line = <$handle>)) {
273 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
274 }
275 return undef;
276 }
277
278 # unknown encoding
279 return undef;
280}
281
282
283# will convert entire contents of file to utf8 and append result to $outputref
284# this may be a slightly faster way to get the contents of a file than by
285# recursively calling read_line()
286sub read_file {
287 my $self = shift (@_);
288 my ($outputref) = @_;
289
290 # make sure we have a file handle
291 return if ($self->{'handle'} eq "");
292
293 my $handle = $self->{'handle'};
294
295 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
296 # special case for the first line of utf8 text to detect whether
297 # the file is in utf8 or unicode
298 $$text .= $self->read_line ();
299 }
300
301 if ($self->{'encoding'} eq "utf8") {
302 undef $/;
303 $$outputref .= <$handle>;
304 $/ = "\n";
305 return;
306 }
307
308 if ($self->{'encoding'} eq "unicode") {
309 my $line = "";
310 while (defined ($line = $self->read_line())) {
311 $$outputref .= $line;
312 }
313 return;
314 }
315
316 if ($self->{'encoding'} eq "gb") {
317 undef $/;
318 my $text = <$handle>;
319 $/ = "\n";
320 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
321 return;
322 }
323
324 if ($self->{'encoding'} eq "iso_8859_1") {
325 undef $/;
326 my $text = <$handle>;
327 $/ = "\n";
328 $$outputref .= &unicode::ascii2utf8 ($text);
329 return;
330 }
331
332 if ($self->{'encoding'} eq "iso_8859_6") {
333 my $text = <$handle>;
334 undef $/;
335 $/ = "\n";
336 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
337 return;
338 }
339
340 if ($self->{'encoding'} eq "windows_1256") {
341 undef $/;
342 my $text = <$handle>;
343 $/ = "\n";
344 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
345 return;
346 }
347}
348
349
3501;
Note: See TracBrowser for help on using the repository browser.