root/gsdl/trunk/perllib/multiread.pm @ 17110

Revision 16793, 8.5 KB (checked in by davidb, 11 years ago)

Double nested function call broken down in to two separate lines to help debugging

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6# Copyright (C) 2005 New Zealand Digital Library project
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21#
22###########################################################################
23
24# the multiread object will read in a number of encodings,
25# the results are always returned in the utf-8 format
26
27# encodings currently supported are
28#
29# utf8             - either utf8 or unicode (automatically detected)
30# unicode          - 2-byte UCS (does endian detection)
31#
32# plus all encodings in the "encodings" package
33
34package multiread;
35
36eval {require bytes};
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41use unicode;
42
43sub new {
44    my ($class) = @_;
45
46    my $self = {'handle'    => "",
47        'first'     => 1,
48        'encoding'  => "utf8",
49        'bigendian' => 1};
50
51    return bless $self, $class;
52}
53
54# set_handle expects the file to be already open but
55# not read yet
56sub set_handle {
57    my $self = shift;
58    $self->{'handle'} = shift;
59    binmode( $self->{'handle'} );
60    $self->{'first'} = 1;
61    $self->{'encoding'} = "utf8";
62    $self->{'bigendian'} = 1;
63}
64
65# set_encoding should be called after set_handle
66sub set_encoding {
67    my $self = shift;
68    $self->{'encoding'} = shift;
69}
70
71sub get_encoding {
72    my $self = shift (@_);
73    return $self->{'encoding'};
74}
75
76# undef will be returned if the eof has been reached
77# the result will always be returned in utf-8
78
79sub read_unicode_char {
80    my $self = shift (@_);
81
82    # make sure we have a file handle
83    return undef if ($self->{'handle'} eq "");
84    my $handle = $self->{'handle'};
85
86    if ($self->{'encoding'} eq "utf8") {
87    # utf-8 text, how many characters we get depends
88    # on what we find
89    my $c1 = "";
90    my $c2 = "";
91    my $c3 = "";
92
93    while (!eof ($handle)) {
94        $c1 = ord (getc ($handle));
95
96        if ($c1 <= 0x7f) {
97        # one byte character
98        return chr ($c1);
99
100        } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
101        # two byte character
102        $c2 = getc ($handle) if (!eof ($handle));
103        return chr ($c1) . $c2;
104
105        } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
106        # three byte character
107        $c2 = getc ($handle) if (!eof ($handle));
108        $c3 = getc ($handle) if (!eof ($handle));
109        return chr ($c1) . $c2 . $c3;
110        }
111
112        # if we get here there was an error in the file, we should
113        # be able to recover from it however, maybe the file is in
114        # another encoding
115    }
116
117    return undef if (eof ($handle));
118    }
119
120    if ($self->{'encoding'} eq "unicode") {
121    # unicode text, get the next two characters
122    return undef if (eof ($handle));
123    my $c1 = ord (getc ($handle));
124    return undef if (eof ($handle));
125    my $c2 = ord (getc ($handle));
126
127    return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
128    }
129
130    return undef;
131}
132
133
134sub unicodechar_to_ord
135{
136    my $self = shift (@_);
137    my ($unicode_text) = @_;
138
139    my $bigendian_ord_array = [];
140
141    my @unicodechar_array = ($unicode_text =~ m/(..)/g);
142
143    foreach my $pair (@unicodechar_array) {
144    # for each 2 byte pair       
145    my $c1=ord(substr($pair,0,1));
146    my $c2=ord(substr($pair,1,1));
147
148    my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1;
149    push(@$bigendian_ord_array,$be_ord);
150    }
151
152    return $bigendian_ord_array;
153}
154
155
156# undef will be returned if the eof has been reached
157# the result will always be returned in utf-8
158sub read_line {
159    my $self = shift (@_);
160
161    # make sure we have a file handle
162    return undef if ($self->{'handle'} eq "");
163
164    my $handle = $self->{'handle'};
165
166    if ($self->{'encoding'} eq "utf8") {
167    # utf-8 line
168    return <$handle>;
169    }
170
171    if ($self->{'encoding'} eq "unicode") {
172    # unicode line
173    my $c = "";
174    my ($c1, $c2) = ("", "");
175    my $out = "";
176    while (read ($handle, $c, 2) == 2) {
177        $c1 = ord (substr ($c, 0, 1));
178        $c2 = ord (substr ($c, 1, 1));
179        $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
180        $out .= $c;
181        last if ($c eq "\n");
182    }
183
184    return $out if (length ($out) > 0);
185    return undef;
186    }
187
188    if ($self->{'encoding'} eq "iso_8859_1") {
189    # we'll use ascii2utf8() for this as it's faster than going
190    # through convert2unicode()
191    my $line = "";
192    if (defined ($line = <$handle>)) {
193        return &unicode::ascii2utf8 (\$line);
194    }
195    }
196
197    # everything else uses unicode::convert2unicode
198    my $line = "";
199    if (defined ($line = <$handle>)) {
200    return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
201    }
202
203    return undef;
204}
205
206
207
208# this will look for a Byte Order Marker at the start of the file, and
209# set the encoding appropriately if there is one, returning any
210# non-marker text on the first line (or returns undef).
211sub find_unicode_bom {
212    my $self=shift;
213
214    my $non_bom_text=""; # to return if we read in 'real' text
215
216    if ($self->{'first'} == 0) { return }
217
218    # make sure we have a file handle
219    return if ($self->{'handle'} eq "");
220    my $handle = $self->{'handle'};
221
222    $self->{'first'} = 0;
223
224    my $gc = getc ($handle);
225    my $b1 = ord($gc);
226    my $b2;
227    my $b3;
228
229    if ($b1 == 0xfe || $b1 == 0xff) {
230    $b2 = ord (getc ($handle)) if (!eof ($handle));
231    if ($b1 == 0xff && $b2 == 0xfe) {
232        $self->{'encoding'} = "unicode";
233        $self->{'bigendian'} = 0;
234        return;
235    } elsif ($b1 == 0xfe && $b2 == 0xff) {
236        $self->{'encoding'} = "unicode";
237        $self->{'bigendian'} = 1;
238        return;
239    } elsif ($b1 == 0xef && $b2 == 0xbb) {
240        $b3 = ord(getc($handle));
241        if ($b3 == 0xbf) {
242        $self->{'encoding'} = "utf8";
243        $self->{'bigendian'} = 1;
244        return;
245        }
246        else {
247        # put back all three bytes
248        $handle->ungetc($b3);
249        $handle->ungetc($b2);
250        $handle->ungetc($b1); return;
251
252        }
253    }
254    else {
255        # put back two bytes read
256        $handle->ungetc($b2);
257        $handle->ungetc($b1); return;
258    }
259    } else { # $b1 != fe or ff
260    # put back the one byte read
261    $handle->ungetc($b1); return;
262    }
263}
264
265
266sub read_file_no_decoding
267{
268    my $self = shift (@_);
269    my ($outputref) = @_;
270
271    # make sure we have a file handle
272    return if ($self->{'handle'} eq "");
273
274    my $handle = $self->{'handle'};
275
276    # if encoding is set to utf8 or unicode, sniff to see if there is a
277    # byte order marker
278    if ($self->{'first'} &&
279    ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
280
281    # this will change $self's encoding if there is a BOM (but won't consume any characters)
282    $self->find_unicode_bom();
283    }
284
285    undef $/;
286    $$outputref .=  <$handle>;
287    $/ = "\n";
288}
289
290
291# will convert entire contents of file to utf8 and append result to $outputref
292# this may be a slightly faster way to get the contents of a file than by
293# recursively calling read_line()
294sub decode_text {
295    my $self = shift (@_);
296
297    my ($raw_text,$decoded_text_ref) = @_;
298
299    if ($self->{'encoding'} eq "utf8") {
300    # Nothing to do, raw text is in utf 8
301    $$decoded_text_ref .= $raw_text;
302    return;
303    }
304
305    if ($self->{'encoding'} eq "unicode") {
306    my $unicode_array = $self->unicodechar_to_ord($raw_text);
307    $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
308    return;
309    }
310
311    if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
312    # we'll use ascii2utf8() for this as it's faster than going
313    # through convert2unicode()
314    $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
315    return;
316    }
317
318    # everything else uses unicode::convert2unicode
319    $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text));
320}
321
322
323
324# will convert entire contents of file to utf8 and append result to $outputref
325# this may be a slightly faster way to get the contents of a file than by
326# recursively calling read_line()
327sub read_file {
328    my $self = shift (@_);
329    my ($outputref) = @_;
330
331    # While unusual, $raw_text is initialized to $$outputref
332    # to be consistent with code before refactoring
333    my $raw_text = $$outputref;
334
335    $self->read_file_no_decoding(\$raw_text);
336    $self->decode_text($raw_text,$outputref);
337}
338
3391;
Note: See TracBrowser for help on using the browser.