source: trunk/gsdl/perllib/multiread.pm@ 1227

Last change on this file since 1227 was 1227, checked in by sjboddie, 24 years ago

Modified the perl code for importing arabic encoded documents. Plugins
now support a windows_1256 and an iso_8859_6 encoding. I was briefly under
the impression that these two encodings were similar enough to be treated
the same. It turns out they're not. It appears that the Windows codepage
1256 is the most commonly used Arabic encoding so "arabic" is a synonym
for windows_1256.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30# gb - GB
31# iso_8859_1 - extended ascii (iso-8859-1)
32# iso_8859_6 - 8 bit arabic (iso-8859-6)
33# windows_1256 - Windows codepage 1256 (Arabic)
34
35package multiread;
36
37use unicode;
38use gb;
39
40sub new {
41 my ($class) = @_;
42
43 my $self = {'handle' => "",
44 'first' => 1,
45 'encoding' => "utf8",
46 'bigendian' => 1};
47
48 return bless $self, $class;
49}
50
51# set_handle expects the file to be already open but
52# not read yet
53sub set_handle {
54 my $self = shift (@_);
55 ($self->{'handle'}) = @_;
56 $self->{'first'} = 1;
57 $self->{'encoding'} = "utf8";
58 $self->{'bigendian'} = 1;
59}
60
61# set_encoding should be called after set_handle
62sub set_encoding {
63 my $self = shift (@_);
64 ($self->{'encoding'}) = @_;
65}
66
67sub get_encoding {
68 my $self = shift (@_);
69 return $self->{'encoding'};
70}
71
72# undef will be returned if the eof has been reached
73# the result will always be returned in utf-8
74# if automatic detection between utf8 and unicode is desired
75# then the encoding should be initially set to utf8
76sub read_char {
77 my $self = shift (@_);
78
79 # make sure we have a file handle
80 return undef if ($self->{'handle'} eq "");
81 my $handle = $self->{'handle'};
82
83 if ($self->{'encoding'} eq "utf8") {
84 # utf-8 text, how many characters we get depends
85 # on what we find
86 my $c1 = "";
87 my $c2 = "";
88 my $c3 = "";
89
90 while (!eof ($handle)) {
91 $c1 = ord (getc ($handle));
92 if ($self->{'first'}) {
93 $self->{'first'} = 0;
94
95 if ($c1 == 0xfe || $c1 == 0xff) {
96 $c2 = ord (getc ($handle)) if (!eof ($handle));
97
98 # if unicode fall through to the unicode reading code
99 if ($c1 == 0xff && $c2 == 0xfe) {
100 $self->{'encoding'} = "unicode";
101 $self->{'bigendian'} = 0;
102 if ($ENV{'GSDLOS'} =~ /windows/i) {
103 binmode ($handle); # silly windows
104 }
105 last;
106
107 } elsif ($c1 == 0xfe && $c2 == 0xff) {
108 $self->{'encoding'} = "unicode";
109 $self->{'bigendian'} = 1;
110 if ($ENV{'GSDLOS'} =~ /windows/i) {
111 binmode ($handle); # silly windows
112 }
113 last;
114 }
115
116 # an error, but we might be able to recover
117 # from it
118 $c1 = $c2;
119 }
120 }
121
122 if ($c1 <= 0x7f) {
123 # one byte character
124 return chr ($c1);
125
126 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
127 # two byte character
128 $c2 = getc ($handle) if (!eof ($handle));
129 return chr ($c1) . $c2;
130
131 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
132 # three byte character
133 $c2 = getc ($handle) if (!eof ($handle));
134 $c3 = getc ($handle) if (!eof ($handle));
135 return chr ($c1) . $c2 . $c3;
136 }
137
138 # if we get here there was an error in the file, we should
139 # be able to recover from it however, maybe the file is in
140 # another encoding
141 }
142
143 return undef if (eof ($handle));
144 }
145
146 if ($self->{'encoding'} eq "unicode") {
147 # unicode text, get the next two characters
148 return undef if (eof ($handle));
149 my $c1 = ord (getc ($handle));
150 return undef if (eof ($handle));
151 my $c2 = ord (getc ($handle));
152
153 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
154 }
155
156 if ($self->{'encoding'} eq "gb") {
157 # GB or GBK
158 return undef if (eof ($handle));
159 my $c1 = getc ($handle);
160 if (ord ($c1) >= 0x81) {
161 # double byte character
162 return undef if (eof ($handle));
163 my $c2 = getc ($handle);
164 return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
165
166 } else {
167 # single byte character
168 return &unicode::ascii2utf8 ($c1);
169 }
170 }
171
172 if ($self->{'encoding'} eq "iso_8859_1") {
173 # Latin 1 extended ascii (ISO-8859-1)
174 return undef if (eof ($handle));
175 return &unicode::ascii2utf8 (getc ($handle));
176 }
177
178 if ($self->{'encoding'} eq "iso_8859_6") {
179 # 8 bit Arabic (IOS-8859-6)
180 return undef if (eof ($handle));
181 return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
182 }
183
184 if ($self->{'encoding'} eq "windows_1256") {
185 # Windows 1256 (Arabic)
186 return undef if (eof ($handle));
187 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
188 }
189
190 # unknown encoding
191 return undef;
192}
193
194
195# undef will be returned if the eof has been reached
196# the result will always be returned in utf-8
197sub read_line {
198 my $self = shift (@_);
199
200 # make sure we have a file handle
201 return undef if ($self->{'handle'} eq "");
202
203 my $handle = $self->{'handle'};
204
205 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
206 # special case for the first line of utf8 text to detect whether
207 # the file is in utf8 or unicode
208 my $out = "";
209 my $thisc = "";
210 while (defined ($thisc = $self->read_char())) {
211 $out .= $thisc;
212 last if ($thisc eq "\n");
213 }
214
215 return $out if (length ($out) > 0);
216 return undef;
217 }
218
219
220 if ($self->{'encoding'} eq "utf8") {
221 # utf-8 line
222 return <$handle>;
223 }
224
225 if ($self->{'encoding'} eq "unicode") {
226 # unicode line
227 my $c = "";
228 my ($c1, $c2) = ("", "");
229 my $out = "";
230 while (read ($handle, $c, 2) == 2) {
231 $c1 = ord (substr ($c, 0, 1));
232 $c2 = ord (substr ($c, 1, 1));
233 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
234 $out .= $c;
235 last if ($c eq "\n");
236 }
237
238 return $out if (length ($out) > 0);
239 return undef;
240 }
241
242 if ($self->{'encoding'} eq "gb") {
243 # GB or GBK
244 my $line = "";
245 if (defined ($line = <$handle>)) {
246 return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
247 }
248 return undef;
249 }
250
251 if ($self->{'encoding'} eq "iso_8859_1") {
252 # extended ascii (ISO-8859-1)
253 my $line = "";
254 if (defined ($line = <$handle>)) {
255 return &unicode::ascii2utf8 ($line);
256 }
257 return undef;
258 }
259
260 if ($self->{'encoding'} eq "iso_8859_6") {
261 # 8 bit arabic (ISO-8859-6)
262 my $line = "";
263 if (defined ($line = <$handle>)) {
264 return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
265 }
266 return undef;
267 }
268
269 if ($self->{'encoding'} eq "windows_1256") {
270 # Windows 1256 (Arabic)
271 my $line = "";
272 if (defined ($line = <$handle>)) {
273 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
274 }
275 return undef;
276 }
277
278 # unknown encoding
279 return undef;
280}
281
282
283# will convert entire contents of file to utf8 and append result to $outputref
284# this may be a slightly faster way to get the contents of a file than by
285# recursively calling read_line()
286sub read_file {
287 my $self = shift (@_);
288 my ($outputref) = @_;
289
290 # make sure we have a file handle
291 return if ($self->{'handle'} eq "");
292
293 my $handle = $self->{'handle'};
294
295 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
296 # special case for the first line of utf8 text to detect whether
297 # the file is in utf8 or unicode
298 $$text .= $self->read_line ();
299 }
300
301 if ($self->{'encoding'} eq "utf8") {
302 undef $/;
303 $$outputref .= <$handle>;
304 $/ = "\n";
305 return;
306 }
307
308 if ($self->{'encoding'} eq "unicode") {
309 my $line = "";
310 while (defined ($line = $self->read_line())) {
311 $$outputref .= $line;
312 }
313 return;
314 }
315
316 if ($self->{'encoding'} eq "gb") {
317 undef $/;
318 my $text = <$handle>;
319 $/ = "\n";
320 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
321 return;
322 }
323
324 if ($self->{'encoding'} eq "iso_8859_1") {
325 undef $/;
326 my $text = <$handle>;
327 $/ = "\n";
328 $$outputref .= &unicode::ascii2utf8 ($text);
329 return;
330 }
331
332 if ($self->{'encoding'} eq "iso_8859_6") {
333 my $text = <$handle>;
334 undef $/;
335 $/ = "\n";
336 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
337 return;
338 }
339
340 if ($self->{'encoding'} eq "windows_1256") {
341 undef $/;
342 my $text = <$handle>;
343 $/ = "\n";
344 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
345 return;
346 }
347}
348
349
3501;
Note: See TracBrowser for help on using the repository browser.