source: main/trunk/greenstone2/perllib/multiread.pm@ 21207

Last change on this file since 21207 was 20579, checked in by davidb, 15 years ago

Unicode encoding broken down into two steps.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6# Copyright (C) 2005 New Zealand Digital Library project
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21#
22###########################################################################
23
24# the multiread object will read in a number of encodings,
25# the results are always returned in the utf-8 format
26
27# encodings currently supported are
28#
29# utf8 - either utf8 or unicode (automatically detected)
30# unicode - 2-byte UCS (does endian detection)
31#
32# plus all encodings in the "encodings" package
33
34package multiread;
35
36eval {require bytes};
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41use unicode;
42
43sub new {
44 my ($class) = @_;
45
46 my $self = {'handle' => "",
47 'first' => 1,
48 'encoding' => "utf8",
49 'bigendian' => 1};
50
51 return bless $self, $class;
52}
53
54# set_handle expects the file to be already open but
55# not read yet
56sub set_handle {
57 my $self = shift;
58 $self->{'handle'} = shift;
59 binmode( $self->{'handle'} );
60 $self->{'first'} = 1;
61 $self->{'encoding'} = "utf8";
62 $self->{'bigendian'} = 1;
63}
64
65# set_encoding should be called after set_handle
66sub set_encoding {
67 my $self = shift;
68 $self->{'encoding'} = shift;
69}
70
71sub get_encoding {
72 my $self = shift (@_);
73 return $self->{'encoding'};
74}
75
76# undef will be returned if the eof has been reached
77# the result will always be returned in utf-8
78
79sub read_unicode_char {
80 my $self = shift (@_);
81
82 # make sure we have a file handle
83 return undef if ($self->{'handle'} eq "");
84 my $handle = $self->{'handle'};
85
86 if ($self->{'encoding'} eq "utf8") {
87 # utf-8 text, how many characters we get depends
88 # on what we find
89 my $c1 = "";
90 my $c2 = "";
91 my $c3 = "";
92
93 while (!eof ($handle)) {
94 $c1 = ord (getc ($handle));
95
96 if ($c1 <= 0x7f) {
97 # one byte character
98 return chr ($c1);
99
100 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
101 # two byte character
102 $c2 = getc ($handle) if (!eof ($handle));
103 return chr ($c1) . $c2;
104
105 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
106 # three byte character
107 $c2 = getc ($handle) if (!eof ($handle));
108 $c3 = getc ($handle) if (!eof ($handle));
109 return chr ($c1) . $c2 . $c3;
110 }
111
112 # if we get here there was an error in the file, we should
113 # be able to recover from it however, maybe the file is in
114 # another encoding
115 }
116
117 return undef if (eof ($handle));
118 }
119
120 if ($self->{'encoding'} eq "unicode") {
121 # unicode text, get the next two characters
122 return undef if (eof ($handle));
123 my $c1 = ord (getc ($handle));
124 return undef if (eof ($handle));
125 my $c2 = ord (getc ($handle));
126
127 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
128 }
129
130 return undef;
131}
132
133
134sub unicodechar_to_ord
135{
136 my $self = shift (@_);
137 my ($unicode_text) = @_;
138
139 my $bigendian_ord_array = [];
140
141 my @unicodechar_array = ($unicode_text =~ m/(..)/g);
142
143 foreach my $pair (@unicodechar_array) {
144 # for each 2 byte pair
145 my $c1=ord(substr($pair,0,1));
146 my $c2=ord(substr($pair,1,1));
147
148 my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1;
149 push(@$bigendian_ord_array,$be_ord);
150 }
151
152 return $bigendian_ord_array;
153}
154
155
156# undef will be returned if the eof has been reached
157# the result will always be returned in utf-8
158sub read_line {
159 my $self = shift (@_);
160
161 # make sure we have a file handle
162 return undef if ($self->{'handle'} eq "");
163
164 my $handle = $self->{'handle'};
165
166 if ($self->{'encoding'} eq "utf8") {
167 # utf-8 line
168 return <$handle>;
169 }
170
171 if ($self->{'encoding'} eq "unicode") {
172 # unicode line
173 my $c = "";
174 my ($c1, $c2) = ("", "");
175 my $out = "";
176 while (read ($handle, $c, 2) == 2) {
177 $c1 = ord (substr ($c, 0, 1));
178 $c2 = ord (substr ($c, 1, 1));
179 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
180 $out .= $c;
181 last if ($c eq "\n");
182 }
183
184 return $out if (length ($out) > 0);
185 return undef;
186 }
187
188 if ($self->{'encoding'} eq "iso_8859_1") {
189 # we'll use ascii2utf8() for this as it's faster than going
190 # through convert2unicode()
191 my $line = "";
192 if (defined ($line = <$handle>)) {
193 return &unicode::ascii2utf8 (\$line);
194 }
195 }
196
197 # everything else uses unicode::convert2unicode
198 my $line = "";
199 if (defined ($line = <$handle>)) {
200 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
201 }
202
203 return undef;
204}
205
206
207
208# this will look for a Byte Order Marker at the start of the file, and
209# set the encoding appropriately if there is one, returning any
210# non-marker text on the first line (or returns undef).
211sub find_unicode_bom {
212 my $self=shift;
213
214 my $non_bom_text=""; # to return if we read in 'real' text
215
216 if ($self->{'first'} == 0) { return }
217
218 # make sure we have a file handle
219 return if ($self->{'handle'} eq "");
220 my $handle = $self->{'handle'};
221
222 $self->{'first'} = 0;
223
224 my $gc = getc ($handle);
225 my $b1 = ord($gc);
226 my $b2;
227 my $b3;
228
229 if ($b1 == 0xfe || $b1 == 0xff) {
230 $b2 = ord (getc ($handle)) if (!eof ($handle));
231 if ($b1 == 0xff && $b2 == 0xfe) {
232 $self->{'encoding'} = "unicode";
233 $self->{'bigendian'} = 0;
234 return;
235 } elsif ($b1 == 0xfe && $b2 == 0xff) {
236 $self->{'encoding'} = "unicode";
237 $self->{'bigendian'} = 1;
238 return;
239 } elsif ($b1 == 0xef && $b2 == 0xbb) {
240 $b3 = ord(getc($handle));
241 if ($b3 == 0xbf) {
242 $self->{'encoding'} = "utf8";
243 $self->{'bigendian'} = 1;
244 return;
245 }
246 else {
247 # put back all three bytes
248 $handle->ungetc($b3);
249 $handle->ungetc($b2);
250 $handle->ungetc($b1); return;
251
252 }
253 }
254 else {
255 # put back two bytes read
256 $handle->ungetc($b2);
257 $handle->ungetc($b1); return;
258 }
259 } else { # $b1 != fe or ff
260 # put back the one byte read
261 $handle->ungetc($b1); return;
262 }
263}
264
265
266sub read_file_no_decoding
267{
268 my $self = shift (@_);
269 my ($outputref) = @_;
270
271 # make sure we have a file handle
272 return if ($self->{'handle'} eq "");
273
274 my $handle = $self->{'handle'};
275
276 # if encoding is set to utf8 or unicode, sniff to see if there is a
277 # byte order marker
278 if ($self->{'first'} &&
279 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
280
281 # this will change $self's encoding if there is a BOM (but won't consume any characters)
282 $self->find_unicode_bom();
283 }
284
285 undef $/;
286 $$outputref .= <$handle>;
287 $/ = "\n";
288
289}
290
291
292# will convert entire contents of file to utf8 and append result to $outputref
293# this may be a slightly faster way to get the contents of a file than by
294# recursively calling read_line()
295sub decode_text {
296 my $self = shift (@_);
297
298 my ($raw_text,$decoded_text_ref) = @_;
299
300 if ($self->{'encoding'} eq "utf8") {
301 # Nothing to do, raw text is in utf 8
302 $$decoded_text_ref .= $raw_text;
303 return;
304 }
305
306 if ($self->{'encoding'} eq "unicode") {
307 my $unicode_array = $self->unicodechar_to_ord($raw_text);
308 $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
309 return;
310 }
311
312 if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
313 # we'll use ascii2utf8() for this as it's faster than going
314 # through convert2unicode()
315 $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
316 return;
317 }
318
319 # everything else uses unicode::convert2unicode
320 my $unicode_text = &unicode::convert2unicode ($self->{'encoding'}, \$raw_text);
321
322 $$decoded_text_ref .= &unicode::unicode2utf8 ($unicode_text);
323
324### print STDERR "!!! decoded ", join(":",map { ord($_) } split(//,$$decoded_text_ref)), "\n";
325}
326
327
328
329# will convert entire contents of file to utf8 and append result to $outputref
330# this may be a slightly faster way to get the contents of a file than by
331# recursively calling read_line()
332sub read_file {
333 my $self = shift (@_);
334 my ($outputref) = @_;
335
336 # While unusual, $raw_text is initialized to $$outputref
337 # to be consistent with code before refactoring
338 my $raw_text = $$outputref;
339
340 $self->read_file_no_decoding(\$raw_text);
341 $self->decode_text($raw_text,$outputref);
342}
343
3441;
Note: See TracBrowser for help on using the repository browser.