source: gsdl/trunk/perllib/multiread.pm@ 14926

Last change on this file since 14926 was 12832, checked in by kjdon, 18 years ago

added in ascii casee in read_file - if not done specially, will be used in a call to unicode::unicode2utf8, which will fail as it doesn't accept ascii

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6# Copyright (C) 2005 New Zealand Digital Library project
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21#
22###########################################################################
23
24# the multiread object will read in a number of encodings,
25# the results are always returned in the utf-8 format
26
27# encodings currently supported are
28#
29# utf8 - either utf8 or unicode (automatically detected)
30# unicode - 2-byte UCS (does endian detection)
31#
32# plus all encodings in the "encodings" package
33
34package multiread;
35
36eval {require bytes};
37
38use unicode;
39
40sub new {
41 my ($class) = @_;
42
43 my $self = {'handle' => "",
44 'first' => 1,
45 'encoding' => "utf8",
46 'bigendian' => 1};
47
48 return bless $self, $class;
49}
50
51# set_handle expects the file to be already open but
52# not read yet
53sub set_handle {
54 my $self = shift;
55 $self->{'handle'} = shift;
56 binmode( $self->{'handle'} );
57 $self->{'first'} = 1;
58 $self->{'encoding'} = "utf8";
59 $self->{'bigendian'} = 1;
60}
61
62# set_encoding should be called after set_handle
63sub set_encoding {
64 my $self = shift;
65 $self->{'encoding'} = shift;
66}
67
68sub get_encoding {
69 my $self = shift (@_);
70 return $self->{'encoding'};
71}
72
73# undef will be returned if the eof has been reached
74# the result will always be returned in utf-8
75
76sub read_unicode_char {
77 my $self = shift (@_);
78
79 # make sure we have a file handle
80 return undef if ($self->{'handle'} eq "");
81 my $handle = $self->{'handle'};
82
83 if ($self->{'encoding'} eq "utf8") {
84 # utf-8 text, how many characters we get depends
85 # on what we find
86 my $c1 = "";
87 my $c2 = "";
88 my $c3 = "";
89
90 while (!eof ($handle)) {
91 $c1 = ord (getc ($handle));
92
93 if ($c1 <= 0x7f) {
94 # one byte character
95 return chr ($c1);
96
97 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
98 # two byte character
99 $c2 = getc ($handle) if (!eof ($handle));
100 return chr ($c1) . $c2;
101
102 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
103 # three byte character
104 $c2 = getc ($handle) if (!eof ($handle));
105 $c3 = getc ($handle) if (!eof ($handle));
106 return chr ($c1) . $c2 . $c3;
107 }
108
109 # if we get here there was an error in the file, we should
110 # be able to recover from it however, maybe the file is in
111 # another encoding
112 }
113
114 return undef if (eof ($handle));
115 }
116
117 if ($self->{'encoding'} eq "unicode") {
118 # unicode text, get the next two characters
119 return undef if (eof ($handle));
120 my $c1 = ord (getc ($handle));
121 return undef if (eof ($handle));
122 my $c2 = ord (getc ($handle));
123
124 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
125 }
126
127 return undef;
128}
129
130
131# undef will be returned if the eof has been reached
132# the result will always be returned in utf-8
133sub read_line {
134 my $self = shift (@_);
135
136 # make sure we have a file handle
137 return undef if ($self->{'handle'} eq "");
138
139 my $handle = $self->{'handle'};
140
141 if ($self->{'encoding'} eq "utf8") {
142 # utf-8 line
143 return <$handle>;
144 }
145
146 if ($self->{'encoding'} eq "unicode") {
147 # unicode line
148 my $c = "";
149 my ($c1, $c2) = ("", "");
150 my $out = "";
151 while (read ($handle, $c, 2) == 2) {
152 $c1 = ord (substr ($c, 0, 1));
153 $c2 = ord (substr ($c, 1, 1));
154 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
155 $out .= $c;
156 last if ($c eq "\n");
157 }
158
159 return $out if (length ($out) > 0);
160 return undef;
161 }
162
163 if ($self->{'encoding'} eq "iso_8859_1") {
164 # we'll use ascii2utf8() for this as it's faster than going
165 # through convert2unicode()
166 my $line = "";
167 if (defined ($line = <$handle>)) {
168 return &unicode::ascii2utf8 (\$line);
169 }
170 }
171
172 # everything else uses unicode::convert2unicode
173 my $line = "";
174 if (defined ($line = <$handle>)) {
175 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
176 }
177
178 return undef;
179}
180
181
182
183# this will look for a Byte Order Marker at the start of the file, and
184# set the encoding appropriately if there is one, returning any
185# non-marker text on the first line (or returns undef).
186sub find_unicode_bom {
187 my $self=shift;
188
189 my $non_bom_text=""; # to return if we read in 'real' text
190
191 if ($self->{'first'} == 0) { return }
192
193 # make sure we have a file handle
194 return if ($self->{'handle'} eq "");
195 my $handle = $self->{'handle'};
196
197 $self->{'first'} = 0;
198
199 my $b1 = ord(getc ($handle));
200 my $b2;
201 my $b3;
202
203 if ($b1 == 0xfe || $b1 == 0xff) {
204 $b2 = ord (getc ($handle)) if (!eof ($handle));
205 if ($b1 == 0xff && $b2 == 0xfe) {
206 $self->{'encoding'} = "unicode";
207 $self->{'bigendian'} = 0;
208 return;
209 } elsif ($b1 == 0xfe && $b2 == 0xff) {
210 $self->{'encoding'} = "unicode";
211 $self->{'bigendian'} = 1;
212 return;
213 } elsif ($b1 == 0xef && $b2 == 0xbb) {
214 $b3 = ord(getc($handle));
215 if ($b3 == 0xbf) {
216 $self->{'encoding'} = "utf8";
217 $self->{'bigendian'} = 1;
218 return;
219 }
220 }
221 } else { # $b1 != fe or ff
222 $handle->ungetc($b1); return;
223 }
224 # if here, we have removed some chars and they aren't a BOM
225 if ($self->{'encoding'} eq "unicode") { # return the 2byte char
226 if (defined ($b3)) { # we looked at this... return it
227 $handle->ungetc($b3);
228 }
229 return &unicode::unicode2utf8([$self->{'bigendian'}?
230 ($b1*256+$b2) : ($b2*256+$b1)]);
231 }
232 # if here, it's utf-8
233 if ($b2 < 0x80) {
234 if (defined ($b3)) { # we grabbed this, but don't need it now
235 $handle->ungetc($b3);
236 }
237 return ($b1 . $b2);
238 }
239 # if here, we have taken part of a multi-byte char. we need to make
240 # sure we return the entire character
241 if (defined($b3) && $b3 < 0x80) { # we have all we need
242 $handle->ungetc($b3);
243 return ($b1 . $b2);
244 }
245 my $c=$b1.$b2.$b3;
246 my $b4=$handle->getc();
247 while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
248 $c .= $b4;
249 $b4=$handle->getc();
250 if (eof($handle)) { last }
251 }
252 if (! eof($handle)) {
253 $handle->ungetc($b4); # this byte is an ascii byte
254 }
255 return $c;
256}
257
258
259
260# will convert entire contents of file to utf8 and append result to $outputref
261# this may be a slightly faster way to get the contents of a file than by
262# recursively calling read_line()
263sub read_file {
264 my $self = shift (@_);
265 my ($outputref) = @_;
266
267 # make sure we have a file handle
268 return if ($self->{'handle'} eq "");
269
270 my $handle = $self->{'handle'};
271
272 # if encoding is set to utf8 or unicode, sniff to see if there is a
273 # byte order marker
274 if ($self->{'first'} &&
275 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
276 # this will change $self's encoding if there is a BOM
277 my $read_text = $self->find_unicode_bom();
278 $$outputref .= $read_text if (defined($read_text));
279 }
280
281 if ($self->{'encoding'} eq "utf8") {
282 undef $/;
283 $$outputref .= <$handle>;
284 $/ = "\n";
285 return;
286 }
287
288 if ($self->{'encoding'} eq "unicode") {
289 my $line = "";
290 while (defined ($line = $self->read_line())) {
291 $$outputref .= $line;
292 }
293 return;
294 }
295
296 if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
297 # we'll use ascii2utf8() for this as it's faster than going
298 # through convert2unicode()
299 undef $/;
300 my $text = <$handle>;
301 $/ = "\n";
302 $$outputref .= &unicode::ascii2utf8 (\$text);
303 return;
304 }
305
306 # everything else uses unicode::convert2unicode
307 undef $/;
308 my $text = <$handle>;
309 $/ = "\n";
310 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
311}
312
3131;
Note: See TracBrowser for help on using the repository browser.