source: trunk/gsdl/perllib/multiread.pm@ 1844

Last change on this file since 1844 was 1844, checked in by sjboddie, 23 years ago

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). Wihen -input_encoding is 'auto' textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30# gb - GB
31# iso_8859_[1-9] - 8 bit extended ascii encodings
32# windows_125[0-6] - Windows codepages 1250 to 1256
33
34package multiread;
35
36use unicode;
37use gb;
38
39sub new {
40 my ($class) = @_;
41
42 my $self = {'handle' => "",
43 'first' => 1,
44 'encoding' => "utf8",
45 'bigendian' => 1};
46
47 return bless $self, $class;
48}
49
50# set_handle expects the file to be already open but
51# not read yet
52sub set_handle {
53 my $self = shift (@_);
54 ($self->{'handle'}) = @_;
55 $self->{'first'} = 1;
56 $self->{'encoding'} = "utf8";
57 $self->{'bigendian'} = 1;
58}
59
60# set_encoding should be called after set_handle
61sub set_encoding {
62 my $self = shift (@_);
63 ($self->{'encoding'}) = @_;
64}
65
66sub get_encoding {
67 my $self = shift (@_);
68 return $self->{'encoding'};
69}
70
71# undef will be returned if the eof has been reached
72# the result will always be returned in utf-8
73# if automatic detection between utf8 and unicode is desired
74# then the encoding should be initially set to utf8
75sub read_char {
76 my $self = shift (@_);
77
78 # make sure we have a file handle
79 return undef if ($self->{'handle'} eq "");
80 my $handle = $self->{'handle'};
81
82 if ($self->{'encoding'} eq "utf8") {
83 # utf-8 text, how many characters we get depends
84 # on what we find
85 my $c1 = "";
86 my $c2 = "";
87 my $c3 = "";
88
89 while (!eof ($handle)) {
90 $c1 = ord (getc ($handle));
91 if ($self->{'first'}) {
92 $self->{'first'} = 0;
93
94 if ($c1 == 0xfe || $c1 == 0xff) {
95 $c2 = ord (getc ($handle)) if (!eof ($handle));
96
97 # if unicode fall through to the unicode reading code
98 if ($c1 == 0xff && $c2 == 0xfe) {
99 $self->{'encoding'} = "unicode";
100 $self->{'bigendian'} = 0;
101 if ($ENV{'GSDLOS'} =~ /windows/i) {
102 binmode ($handle); # silly windows
103 }
104 last;
105
106 } elsif ($c1 == 0xfe && $c2 == 0xff) {
107 $self->{'encoding'} = "unicode";
108 $self->{'bigendian'} = 1;
109 if ($ENV{'GSDLOS'} =~ /windows/i) {
110 binmode ($handle); # silly windows
111 }
112 last;
113 }
114
115 # an error, but we might be able to recover
116 # from it
117 $c1 = $c2;
118 }
119 }
120
121 if ($c1 <= 0x7f) {
122 # one byte character
123 return chr ($c1);
124
125 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
126 # two byte character
127 $c2 = getc ($handle) if (!eof ($handle));
128 return chr ($c1) . $c2;
129
130 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
131 # three byte character
132 $c2 = getc ($handle) if (!eof ($handle));
133 $c3 = getc ($handle) if (!eof ($handle));
134 return chr ($c1) . $c2 . $c3;
135 }
136
137 # if we get here there was an error in the file, we should
138 # be able to recover from it however, maybe the file is in
139 # another encoding
140 }
141
142 return undef if (eof ($handle));
143 }
144
145 if ($self->{'encoding'} eq "unicode") {
146 # unicode text, get the next two characters
147 return undef if (eof ($handle));
148 my $c1 = ord (getc ($handle));
149 return undef if (eof ($handle));
150 my $c2 = ord (getc ($handle));
151
152 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
153 }
154
155 if ($self->{'encoding'} eq "gb") {
156 # GB or GBK
157 return undef if (eof ($handle));
158 my $c1 = getc ($handle);
159 if (ord ($c1) >= 0x81) {
160 # double byte character
161 return undef if (eof ($handle));
162 my $c2 = getc ($handle);
163 return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
164
165 } else {
166 # single byte character
167 return &unicode::ascii2utf8 ($c1);
168 }
169 }
170
171 if ($self->{'encoding'} eq "iso_8859_1") {
172 # special case for iso_8859_1 as &ascii2utf8($char) is faster than
173 # &unicode2utf8(iso2unicode('1', $char))
174 return undef if (eof ($handle));
175 return &unicode::ascii2utf8 (getc ($handle));
176 }
177
178 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
179 return undef if (eof ($handle));
180 return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle)));
181 }
182
183 if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
184 return undef if (eof ($handle));
185 return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle)));
186 }
187
188 if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
189 return undef if (eof ($handle));
190 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle)));
191 }
192
193 # unknown encoding
194 return undef;
195}
196
197
198# undef will be returned if the eof has been reached
199# the result will always be returned in utf-8
200sub read_line {
201 my $self = shift (@_);
202
203 # make sure we have a file handle
204 return undef if ($self->{'handle'} eq "");
205
206 my $handle = $self->{'handle'};
207
208 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
209 # special case for the first line of utf8 text to detect whether
210 # the file is in utf8 or unicode
211 my $out = "";
212 my $thisc = "";
213 while (defined ($thisc = $self->read_char())) {
214 $out .= $thisc;
215 last if ($thisc eq "\n");
216 }
217
218 return $out if (length ($out) > 0);
219 return undef;
220 }
221
222
223 if ($self->{'encoding'} eq "utf8") {
224 # utf-8 line
225 return <$handle>;
226 }
227
228 if ($self->{'encoding'} eq "unicode") {
229 # unicode line
230 my $c = "";
231 my ($c1, $c2) = ("", "");
232 my $out = "";
233 while (read ($handle, $c, 2) == 2) {
234 $c1 = ord (substr ($c, 0, 1));
235 $c2 = ord (substr ($c, 1, 1));
236 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
237 $out .= $c;
238 last if ($c eq "\n");
239 }
240
241 return $out if (length ($out) > 0);
242 return undef;
243 }
244
245 if ($self->{'encoding'} eq "gb") {
246 # GB or GBK
247 my $line = "";
248 if (defined ($line = <$handle>)) {
249 return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
250 }
251 return undef;
252 }
253
254 if ($self->{'encoding'} eq "iso_8859_1") {
255 # special case for iso_8859_1 as &ascii2utf8($line) is faster than
256 # &unicode2utf8(iso2unicode('1', $line))
257 my $line = "";
258 if (defined ($line = <$handle>)) {
259 return &unicode::ascii2utf8 ($line);
260 }
261 return undef;
262 }
263
264 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
265 my $line = "";
266 if (defined ($line = <$handle>)) {
267 return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line));
268 }
269 return undef;
270 }
271
272 if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
273 my $line = "";
274 if (defined ($line = <$handle>)) {
275 return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line));
276 }
277 return undef;
278 }
279
280 if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
281 my $line = "";
282 if (defined ($line = <$handle>)) {
283 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
284 }
285 return undef;
286 }
287
288 # unknown encoding
289 return undef;
290}
291
292
293# will convert entire contents of file to utf8 and append result to $outputref
294# this may be a slightly faster way to get the contents of a file than by
295# recursively calling read_line()
296sub read_file {
297 my $self = shift (@_);
298 my ($outputref) = @_;
299
300 # make sure we have a file handle
301 return if ($self->{'handle'} eq "");
302
303 my $handle = $self->{'handle'};
304
305 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
306 # special case for the first line of utf8 text to detect whether
307 # the file is in utf8 or unicode
308 $$text .= $self->read_line ();
309 }
310
311 if ($self->{'encoding'} eq "utf8") {
312 undef $/;
313 $$outputref .= <$handle>;
314 $/ = "\n";
315 return;
316 }
317
318 if ($self->{'encoding'} eq "unicode") {
319 my $line = "";
320 while (defined ($line = $self->read_line())) {
321 $$outputref .= $line;
322 }
323 return;
324 }
325
326 if ($self->{'encoding'} eq "gb") {
327 undef $/;
328 my $text = <$handle>;
329 $/ = "\n";
330 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
331 return;
332 }
333
334 if ($self->{'encoding'} eq "iso_8859_1") {
335 # special case for iso_8859_1 as &ascii2utf8($text) is faster than
336 # &unicode2utf8(iso2unicode('1', $text))
337 undef $/;
338 my $text = <$handle>;
339 $/ = "\n";
340 $$outputref .= &unicode::ascii2utf8 ($text);
341 return;
342 }
343
344 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
345 undef $/;
346 my $text = <$handle>;
347 $/ = "\n";
348 $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text));
349 return;
350 }
351
352 if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
353 undef $/;
354 my $text = <$handle>;
355 $/ = "\n";
356 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text));
357 return;
358 }
359
360 if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
361 undef $/;
362 my $text = <$handle>;
363 $/ = "\n";
364 $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text));
365 return;
366 }
367}
368
3691;
Note: See TracBrowser for help on using the repository browser.