source: gsdl/trunk/perllib/multiread.pm@ 18342

Last change on this file since 18342 was 16793, checked in by davidb, 16 years ago

Double nested function call broken down in to two separate lines to help debugging

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.5 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6# Copyright (C) 2005 New Zealand Digital Library project
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21#
22###########################################################################
23
24# the multiread object will read in a number of encodings,
25# the results are always returned in the utf-8 format
26
27# encodings currently supported are
28#
29# utf8 - either utf8 or unicode (automatically detected)
30# unicode - 2-byte UCS (does endian detection)
31#
32# plus all encodings in the "encodings" package
33
34package multiread;
35
36eval {require bytes};
37
38use strict;
39no strict 'refs'; # allow filehandles to be variables and viceversa
40
41use unicode;
42
43sub new {
44 my ($class) = @_;
45
46 my $self = {'handle' => "",
47 'first' => 1,
48 'encoding' => "utf8",
49 'bigendian' => 1};
50
51 return bless $self, $class;
52}
53
54# set_handle expects the file to be already open but
55# not read yet
56sub set_handle {
57 my $self = shift;
58 $self->{'handle'} = shift;
59 binmode( $self->{'handle'} );
60 $self->{'first'} = 1;
61 $self->{'encoding'} = "utf8";
62 $self->{'bigendian'} = 1;
63}
64
65# set_encoding should be called after set_handle
66sub set_encoding {
67 my $self = shift;
68 $self->{'encoding'} = shift;
69}
70
71sub get_encoding {
72 my $self = shift (@_);
73 return $self->{'encoding'};
74}
75
76# undef will be returned if the eof has been reached
77# the result will always be returned in utf-8
78
79sub read_unicode_char {
80 my $self = shift (@_);
81
82 # make sure we have a file handle
83 return undef if ($self->{'handle'} eq "");
84 my $handle = $self->{'handle'};
85
86 if ($self->{'encoding'} eq "utf8") {
87 # utf-8 text, how many characters we get depends
88 # on what we find
89 my $c1 = "";
90 my $c2 = "";
91 my $c3 = "";
92
93 while (!eof ($handle)) {
94 $c1 = ord (getc ($handle));
95
96 if ($c1 <= 0x7f) {
97 # one byte character
98 return chr ($c1);
99
100 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
101 # two byte character
102 $c2 = getc ($handle) if (!eof ($handle));
103 return chr ($c1) . $c2;
104
105 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
106 # three byte character
107 $c2 = getc ($handle) if (!eof ($handle));
108 $c3 = getc ($handle) if (!eof ($handle));
109 return chr ($c1) . $c2 . $c3;
110 }
111
112 # if we get here there was an error in the file, we should
113 # be able to recover from it however, maybe the file is in
114 # another encoding
115 }
116
117 return undef if (eof ($handle));
118 }
119
120 if ($self->{'encoding'} eq "unicode") {
121 # unicode text, get the next two characters
122 return undef if (eof ($handle));
123 my $c1 = ord (getc ($handle));
124 return undef if (eof ($handle));
125 my $c2 = ord (getc ($handle));
126
127 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
128 }
129
130 return undef;
131}
132
133
134sub unicodechar_to_ord
135{
136 my $self = shift (@_);
137 my ($unicode_text) = @_;
138
139 my $bigendian_ord_array = [];
140
141 my @unicodechar_array = ($unicode_text =~ m/(..)/g);
142
143 foreach my $pair (@unicodechar_array) {
144 # for each 2 byte pair
145 my $c1=ord(substr($pair,0,1));
146 my $c2=ord(substr($pair,1,1));
147
148 my $be_ord = ($self->{'bigendian'}) ? $c1*256+$c2 : $c2*256+$c1;
149 push(@$bigendian_ord_array,$be_ord);
150 }
151
152 return $bigendian_ord_array;
153}
154
155
156# undef will be returned if the eof has been reached
157# the result will always be returned in utf-8
158sub read_line {
159 my $self = shift (@_);
160
161 # make sure we have a file handle
162 return undef if ($self->{'handle'} eq "");
163
164 my $handle = $self->{'handle'};
165
166 if ($self->{'encoding'} eq "utf8") {
167 # utf-8 line
168 return <$handle>;
169 }
170
171 if ($self->{'encoding'} eq "unicode") {
172 # unicode line
173 my $c = "";
174 my ($c1, $c2) = ("", "");
175 my $out = "";
176 while (read ($handle, $c, 2) == 2) {
177 $c1 = ord (substr ($c, 0, 1));
178 $c2 = ord (substr ($c, 1, 1));
179 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
180 $out .= $c;
181 last if ($c eq "\n");
182 }
183
184 return $out if (length ($out) > 0);
185 return undef;
186 }
187
188 if ($self->{'encoding'} eq "iso_8859_1") {
189 # we'll use ascii2utf8() for this as it's faster than going
190 # through convert2unicode()
191 my $line = "";
192 if (defined ($line = <$handle>)) {
193 return &unicode::ascii2utf8 (\$line);
194 }
195 }
196
197 # everything else uses unicode::convert2unicode
198 my $line = "";
199 if (defined ($line = <$handle>)) {
200 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
201 }
202
203 return undef;
204}
205
206
207
208# this will look for a Byte Order Marker at the start of the file, and
209# set the encoding appropriately if there is one, returning any
210# non-marker text on the first line (or returns undef).
211sub find_unicode_bom {
212 my $self=shift;
213
214 my $non_bom_text=""; # to return if we read in 'real' text
215
216 if ($self->{'first'} == 0) { return }
217
218 # make sure we have a file handle
219 return if ($self->{'handle'} eq "");
220 my $handle = $self->{'handle'};
221
222 $self->{'first'} = 0;
223
224 my $gc = getc ($handle);
225 my $b1 = ord($gc);
226 my $b2;
227 my $b3;
228
229 if ($b1 == 0xfe || $b1 == 0xff) {
230 $b2 = ord (getc ($handle)) if (!eof ($handle));
231 if ($b1 == 0xff && $b2 == 0xfe) {
232 $self->{'encoding'} = "unicode";
233 $self->{'bigendian'} = 0;
234 return;
235 } elsif ($b1 == 0xfe && $b2 == 0xff) {
236 $self->{'encoding'} = "unicode";
237 $self->{'bigendian'} = 1;
238 return;
239 } elsif ($b1 == 0xef && $b2 == 0xbb) {
240 $b3 = ord(getc($handle));
241 if ($b3 == 0xbf) {
242 $self->{'encoding'} = "utf8";
243 $self->{'bigendian'} = 1;
244 return;
245 }
246 else {
247 # put back all three bytes
248 $handle->ungetc($b3);
249 $handle->ungetc($b2);
250 $handle->ungetc($b1); return;
251
252 }
253 }
254 else {
255 # put back two bytes read
256 $handle->ungetc($b2);
257 $handle->ungetc($b1); return;
258 }
259 } else { # $b1 != fe or ff
260 # put back the one byte read
261 $handle->ungetc($b1); return;
262 }
263}
264
265
266sub read_file_no_decoding
267{
268 my $self = shift (@_);
269 my ($outputref) = @_;
270
271 # make sure we have a file handle
272 return if ($self->{'handle'} eq "");
273
274 my $handle = $self->{'handle'};
275
276 # if encoding is set to utf8 or unicode, sniff to see if there is a
277 # byte order marker
278 if ($self->{'first'} &&
279 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
280
281 # this will change $self's encoding if there is a BOM (but won't consume any characters)
282 $self->find_unicode_bom();
283 }
284
285 undef $/;
286 $$outputref .= <$handle>;
287 $/ = "\n";
288}
289
290
291# will convert entire contents of file to utf8 and append result to $outputref
292# this may be a slightly faster way to get the contents of a file than by
293# recursively calling read_line()
294sub decode_text {
295 my $self = shift (@_);
296
297 my ($raw_text,$decoded_text_ref) = @_;
298
299 if ($self->{'encoding'} eq "utf8") {
300 # Nothing to do, raw text is in utf 8
301 $$decoded_text_ref .= $raw_text;
302 return;
303 }
304
305 if ($self->{'encoding'} eq "unicode") {
306 my $unicode_array = $self->unicodechar_to_ord($raw_text);
307 $$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
308 return;
309 }
310
311 if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
312 # we'll use ascii2utf8() for this as it's faster than going
313 # through convert2unicode()
314 $$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
315 return;
316 }
317
318 # everything else uses unicode::convert2unicode
319 $$decoded_text_ref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$raw_text));
320}
321
322
323
324# will convert entire contents of file to utf8 and append result to $outputref
325# this may be a slightly faster way to get the contents of a file than by
326# recursively calling read_line()
327sub read_file {
328 my $self = shift (@_);
329 my ($outputref) = @_;
330
331 # While unusual, $raw_text is initialized to $$outputref
332 # to be consistent with code before refactoring
333 my $raw_text = $$outputref;
334
335 $self->read_file_no_decoding(\$raw_text);
336 $self->decode_text($raw_text,$outputref);
337}
338
3391;
Note: See TracBrowser for help on using the repository browser.