source: trunk/gsdl/perllib/multiread.pm@ 1838

Last change on this file since 1838 was 1838, checked in by sjboddie, 23 years ago

Added support for Cyrillic languages (windows codepage 1251) - yet to be
tested by anyone who understands any such language though ;)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30# gb - GB
31# iso_8859_1 - extended ascii (iso-8859-1)
32# iso_8859_6 - 8 bit arabic (iso-8859-6)
33# windows_1256 - Windows codepage 1256 (Arabic)
34# windows_1251 - Windows codepage 1251 (Cyrillic)
35
36package multiread;
37
38use unicode;
39use gb;
40
41sub new {
42 my ($class) = @_;
43
44 my $self = {'handle' => "",
45 'first' => 1,
46 'encoding' => "utf8",
47 'bigendian' => 1};
48
49 return bless $self, $class;
50}
51
52# set_handle expects the file to be already open but
53# not read yet
54sub set_handle {
55 my $self = shift (@_);
56 ($self->{'handle'}) = @_;
57 $self->{'first'} = 1;
58 $self->{'encoding'} = "utf8";
59 $self->{'bigendian'} = 1;
60}
61
62# set_encoding should be called after set_handle
63sub set_encoding {
64 my $self = shift (@_);
65 ($self->{'encoding'}) = @_;
66}
67
68sub get_encoding {
69 my $self = shift (@_);
70 return $self->{'encoding'};
71}
72
73# undef will be returned if the eof has been reached
74# the result will always be returned in utf-8
75# if automatic detection between utf8 and unicode is desired
76# then the encoding should be initially set to utf8
77sub read_char {
78 my $self = shift (@_);
79
80 # make sure we have a file handle
81 return undef if ($self->{'handle'} eq "");
82 my $handle = $self->{'handle'};
83
84 if ($self->{'encoding'} eq "utf8") {
85 # utf-8 text, how many characters we get depends
86 # on what we find
87 my $c1 = "";
88 my $c2 = "";
89 my $c3 = "";
90
91 while (!eof ($handle)) {
92 $c1 = ord (getc ($handle));
93 if ($self->{'first'}) {
94 $self->{'first'} = 0;
95
96 if ($c1 == 0xfe || $c1 == 0xff) {
97 $c2 = ord (getc ($handle)) if (!eof ($handle));
98
99 # if unicode fall through to the unicode reading code
100 if ($c1 == 0xff && $c2 == 0xfe) {
101 $self->{'encoding'} = "unicode";
102 $self->{'bigendian'} = 0;
103 if ($ENV{'GSDLOS'} =~ /windows/i) {
104 binmode ($handle); # silly windows
105 }
106 last;
107
108 } elsif ($c1 == 0xfe && $c2 == 0xff) {
109 $self->{'encoding'} = "unicode";
110 $self->{'bigendian'} = 1;
111 if ($ENV{'GSDLOS'} =~ /windows/i) {
112 binmode ($handle); # silly windows
113 }
114 last;
115 }
116
117 # an error, but we might be able to recover
118 # from it
119 $c1 = $c2;
120 }
121 }
122
123 if ($c1 <= 0x7f) {
124 # one byte character
125 return chr ($c1);
126
127 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
128 # two byte character
129 $c2 = getc ($handle) if (!eof ($handle));
130 return chr ($c1) . $c2;
131
132 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
133 # three byte character
134 $c2 = getc ($handle) if (!eof ($handle));
135 $c3 = getc ($handle) if (!eof ($handle));
136 return chr ($c1) . $c2 . $c3;
137 }
138
139 # if we get here there was an error in the file, we should
140 # be able to recover from it however, maybe the file is in
141 # another encoding
142 }
143
144 return undef if (eof ($handle));
145 }
146
147 if ($self->{'encoding'} eq "unicode") {
148 # unicode text, get the next two characters
149 return undef if (eof ($handle));
150 my $c1 = ord (getc ($handle));
151 return undef if (eof ($handle));
152 my $c2 = ord (getc ($handle));
153
154 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
155 }
156
157 if ($self->{'encoding'} eq "gb") {
158 # GB or GBK
159 return undef if (eof ($handle));
160 my $c1 = getc ($handle);
161 if (ord ($c1) >= 0x81) {
162 # double byte character
163 return undef if (eof ($handle));
164 my $c2 = getc ($handle);
165 return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
166
167 } else {
168 # single byte character
169 return &unicode::ascii2utf8 ($c1);
170 }
171 }
172
173 if ($self->{'encoding'} eq "iso_8859_1") {
174 # Latin 1 extended ascii (ISO-8859-1)
175 return undef if (eof ($handle));
176 return &unicode::ascii2utf8 (getc ($handle));
177 }
178
179 if ($self->{'encoding'} eq "iso_8859_6") {
180 # 8 bit Arabic (IOS-8859-6)
181 return undef if (eof ($handle));
182 return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
183 }
184
185 if ($self->{'encoding'} eq "windows_1256") {
186 # Windows 1256 (Arabic)
187 return undef if (eof ($handle));
188 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
189 }
190
191 if ($self->{'encoding'} eq "windows_1251") {
192 # Windows 1251 (Cyrillic)
193 return undef if (eof ($handle));
194 return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", getc ($handle)));
195 }
196
197 # unknown encoding
198 return undef;
199}
200
201
202# undef will be returned if the eof has been reached
203# the result will always be returned in utf-8
204sub read_line {
205 my $self = shift (@_);
206
207 # make sure we have a file handle
208 return undef if ($self->{'handle'} eq "");
209
210 my $handle = $self->{'handle'};
211
212 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
213 # special case for the first line of utf8 text to detect whether
214 # the file is in utf8 or unicode
215 my $out = "";
216 my $thisc = "";
217 while (defined ($thisc = $self->read_char())) {
218 $out .= $thisc;
219 last if ($thisc eq "\n");
220 }
221
222 return $out if (length ($out) > 0);
223 return undef;
224 }
225
226
227 if ($self->{'encoding'} eq "utf8") {
228 # utf-8 line
229 return <$handle>;
230 }
231
232 if ($self->{'encoding'} eq "unicode") {
233 # unicode line
234 my $c = "";
235 my ($c1, $c2) = ("", "");
236 my $out = "";
237 while (read ($handle, $c, 2) == 2) {
238 $c1 = ord (substr ($c, 0, 1));
239 $c2 = ord (substr ($c, 1, 1));
240 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
241 $out .= $c;
242 last if ($c eq "\n");
243 }
244
245 return $out if (length ($out) > 0);
246 return undef;
247 }
248
249 if ($self->{'encoding'} eq "gb") {
250 # GB or GBK
251 my $line = "";
252 if (defined ($line = <$handle>)) {
253 return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
254 }
255 return undef;
256 }
257
258 if ($self->{'encoding'} eq "iso_8859_1") {
259 # extended ascii (ISO-8859-1)
260 my $line = "";
261 if (defined ($line = <$handle>)) {
262 return &unicode::ascii2utf8 ($line);
263 }
264 return undef;
265 }
266
267 if ($self->{'encoding'} eq "iso_8859_6") {
268 # 8 bit arabic (ISO-8859-6)
269 my $line = "";
270 if (defined ($line = <$handle>)) {
271 return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
272 }
273 return undef;
274 }
275
276 if ($self->{'encoding'} eq "windows_1256") {
277 # Windows 1256 (Arabic)
278 my $line = "";
279 if (defined ($line = <$handle>)) {
280 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
281 }
282 return undef;
283 }
284
285 if ($self->{'encoding'} eq "windows_1251") {
286 # Windows 1251 (Cyrillic)
287 my $line = "";
288 if (defined ($line = <$handle>)) {
289 return &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $line));
290 }
291 return undef;
292 }
293
294 # unknown encoding
295 return undef;
296}
297
298
299# will convert entire contents of file to utf8 and append result to $outputref
300# this may be a slightly faster way to get the contents of a file than by
301# recursively calling read_line()
302sub read_file {
303 my $self = shift (@_);
304 my ($outputref) = @_;
305
306 # make sure we have a file handle
307 return if ($self->{'handle'} eq "");
308
309 my $handle = $self->{'handle'};
310
311 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
312 # special case for the first line of utf8 text to detect whether
313 # the file is in utf8 or unicode
314 $$text .= $self->read_line ();
315 }
316
317 if ($self->{'encoding'} eq "utf8") {
318 undef $/;
319 $$outputref .= <$handle>;
320 $/ = "\n";
321 return;
322 }
323
324 if ($self->{'encoding'} eq "unicode") {
325 my $line = "";
326 while (defined ($line = $self->read_line())) {
327 $$outputref .= $line;
328 }
329 return;
330 }
331
332 if ($self->{'encoding'} eq "gb") {
333 undef $/;
334 my $text = <$handle>;
335 $/ = "\n";
336 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
337 return;
338 }
339
340 if ($self->{'encoding'} eq "iso_8859_1") {
341 undef $/;
342 my $text = <$handle>;
343 $/ = "\n";
344 $$outputref .= &unicode::ascii2utf8 ($text);
345 return;
346 }
347
348 if ($self->{'encoding'} eq "iso_8859_6") {
349 my $text = <$handle>;
350 undef $/;
351 $/ = "\n";
352 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
353 return;
354 }
355
356 if ($self->{'encoding'} eq "windows_1256") {
357 undef $/;
358 my $text = <$handle>;
359 $/ = "\n";
360 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
361 return;
362 }
363
364 if ($self->{'encoding'} eq "windows_1251") {
365 undef $/;
366 my $text = <$handle>;
367 $/ = "\n";
368 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1251", $text));
369 return;
370 }
371}
372
373
3741;
Note: See TracBrowser for help on using the repository browser.