source: gsdl/trunk/perllib/multiread.pm@ 15894

Last change on this file since 15894 was 15894, checked in by mdewsnip, 16 years ago

Added "use strict" to the files missing it.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6# Copyright (C) 2005 New Zealand Digital Library project
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program; if not, write to the Free Software
20# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21#
22###########################################################################
23
24# the multiread object will read in a number of encodings,
25# the results are always returned in the utf-8 format
26
27# encodings currently supported are
28#
29# utf8 - either utf8 or unicode (automatically detected)
30# unicode - 2-byte UCS (does endian detection)
31#
32# plus all encodings in the "encodings" package
33
34package multiread;
35
36eval {require bytes};
37
38use strict;
39use unicode;
40
41sub new {
42 my ($class) = @_;
43
44 my $self = {'handle' => "",
45 'first' => 1,
46 'encoding' => "utf8",
47 'bigendian' => 1};
48
49 return bless $self, $class;
50}
51
52# set_handle expects the file to be already open but
53# not read yet
54sub set_handle {
55 my $self = shift;
56 $self->{'handle'} = shift;
57 binmode( $self->{'handle'} );
58 $self->{'first'} = 1;
59 $self->{'encoding'} = "utf8";
60 $self->{'bigendian'} = 1;
61}
62
63# set_encoding should be called after set_handle
64sub set_encoding {
65 my $self = shift;
66 $self->{'encoding'} = shift;
67}
68
69sub get_encoding {
70 my $self = shift (@_);
71 return $self->{'encoding'};
72}
73
74# undef will be returned if the eof has been reached
75# the result will always be returned in utf-8
76
77sub read_unicode_char {
78 my $self = shift (@_);
79
80 # make sure we have a file handle
81 return undef if ($self->{'handle'} eq "");
82 my $handle = $self->{'handle'};
83
84 if ($self->{'encoding'} eq "utf8") {
85 # utf-8 text, how many characters we get depends
86 # on what we find
87 my $c1 = "";
88 my $c2 = "";
89 my $c3 = "";
90
91 while (!eof ($handle)) {
92 $c1 = ord (getc ($handle));
93
94 if ($c1 <= 0x7f) {
95 # one byte character
96 return chr ($c1);
97
98 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
99 # two byte character
100 $c2 = getc ($handle) if (!eof ($handle));
101 return chr ($c1) . $c2;
102
103 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
104 # three byte character
105 $c2 = getc ($handle) if (!eof ($handle));
106 $c3 = getc ($handle) if (!eof ($handle));
107 return chr ($c1) . $c2 . $c3;
108 }
109
110 # if we get here there was an error in the file, we should
111 # be able to recover from it however, maybe the file is in
112 # another encoding
113 }
114
115 return undef if (eof ($handle));
116 }
117
118 if ($self->{'encoding'} eq "unicode") {
119 # unicode text, get the next two characters
120 return undef if (eof ($handle));
121 my $c1 = ord (getc ($handle));
122 return undef if (eof ($handle));
123 my $c2 = ord (getc ($handle));
124
125 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
126 }
127
128 return undef;
129}
130
131
132# undef will be returned if the eof has been reached
133# the result will always be returned in utf-8
134sub read_line {
135 my $self = shift (@_);
136
137 # make sure we have a file handle
138 return undef if ($self->{'handle'} eq "");
139
140 my $handle = $self->{'handle'};
141
142 if ($self->{'encoding'} eq "utf8") {
143 # utf-8 line
144 return <$handle>;
145 }
146
147 if ($self->{'encoding'} eq "unicode") {
148 # unicode line
149 my $c = "";
150 my ($c1, $c2) = ("", "");
151 my $out = "";
152 while (read ($handle, $c, 2) == 2) {
153 $c1 = ord (substr ($c, 0, 1));
154 $c2 = ord (substr ($c, 1, 1));
155 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
156 $out .= $c;
157 last if ($c eq "\n");
158 }
159
160 return $out if (length ($out) > 0);
161 return undef;
162 }
163
164 if ($self->{'encoding'} eq "iso_8859_1") {
165 # we'll use ascii2utf8() for this as it's faster than going
166 # through convert2unicode()
167 my $line = "";
168 if (defined ($line = <$handle>)) {
169 return &unicode::ascii2utf8 (\$line);
170 }
171 }
172
173 # everything else uses unicode::convert2unicode
174 my $line = "";
175 if (defined ($line = <$handle>)) {
176 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
177 }
178
179 return undef;
180}
181
182
183
184# this will look for a Byte Order Marker at the start of the file, and
185# set the encoding appropriately if there is one, returning any
186# non-marker text on the first line (or returns undef).
187sub find_unicode_bom {
188 my $self=shift;
189
190 my $non_bom_text=""; # to return if we read in 'real' text
191
192 if ($self->{'first'} == 0) { return }
193
194 # make sure we have a file handle
195 return if ($self->{'handle'} eq "");
196 my $handle = $self->{'handle'};
197
198 $self->{'first'} = 0;
199
200 my $b1 = ord(getc ($handle));
201 my $b2;
202 my $b3;
203
204 if ($b1 == 0xfe || $b1 == 0xff) {
205 $b2 = ord (getc ($handle)) if (!eof ($handle));
206 if ($b1 == 0xff && $b2 == 0xfe) {
207 $self->{'encoding'} = "unicode";
208 $self->{'bigendian'} = 0;
209 return;
210 } elsif ($b1 == 0xfe && $b2 == 0xff) {
211 $self->{'encoding'} = "unicode";
212 $self->{'bigendian'} = 1;
213 return;
214 } elsif ($b1 == 0xef && $b2 == 0xbb) {
215 $b3 = ord(getc($handle));
216 if ($b3 == 0xbf) {
217 $self->{'encoding'} = "utf8";
218 $self->{'bigendian'} = 1;
219 return;
220 }
221 }
222 } else { # $b1 != fe or ff
223 $handle->ungetc($b1); return;
224 }
225 # if here, we have removed some chars and they aren't a BOM
226 if ($self->{'encoding'} eq "unicode") { # return the 2byte char
227 if (defined ($b3)) { # we looked at this... return it
228 $handle->ungetc($b3);
229 }
230 return &unicode::unicode2utf8([$self->{'bigendian'}?
231 ($b1*256+$b2) : ($b2*256+$b1)]);
232 }
233 # if here, it's utf-8
234 if ($b2 < 0x80) {
235 if (defined ($b3)) { # we grabbed this, but don't need it now
236 $handle->ungetc($b3);
237 }
238 return ($b1 . $b2);
239 }
240 # if here, we have taken part of a multi-byte char. we need to make
241 # sure we return the entire character
242 if (defined($b3) && $b3 < 0x80) { # we have all we need
243 $handle->ungetc($b3);
244 return ($b1 . $b2);
245 }
246 my $c=$b1.$b2.$b3;
247 my $b4=$handle->getc();
248 while ($b4 > 0x7f) { # note - this will return consecutive mb utf8 chars
249 $c .= $b4;
250 $b4=$handle->getc();
251 if (eof($handle)) { last }
252 }
253 if (! eof($handle)) {
254 $handle->ungetc($b4); # this byte is an ascii byte
255 }
256 return $c;
257}
258
259
260
261# will convert entire contents of file to utf8 and append result to $outputref
262# this may be a slightly faster way to get the contents of a file than by
263# recursively calling read_line()
264sub read_file {
265 my $self = shift (@_);
266 my ($outputref) = @_;
267
268 # make sure we have a file handle
269 return if ($self->{'handle'} eq "");
270
271 my $handle = $self->{'handle'};
272
273 # if encoding is set to utf8 or unicode, sniff to see if there is a
274 # byte order marker
275 if ($self->{'first'} &&
276 ($self->{'encoding'} eq "utf8" || $self->{'encoding'} eq 'unicode')) {
277 # this will change $self's encoding if there is a BOM
278 my $read_text = $self->find_unicode_bom();
279 $$outputref .= $read_text if (defined($read_text));
280 }
281
282 if ($self->{'encoding'} eq "utf8") {
283 undef $/;
284 $$outputref .= <$handle>;
285 $/ = "\n";
286 return;
287 }
288
289 if ($self->{'encoding'} eq "unicode") {
290 my $line = "";
291 while (defined ($line = $self->read_line())) {
292 $$outputref .= $line;
293 }
294 return;
295 }
296
297 if ($self->{'encoding'} eq "iso_8859_1" || $self->{'encoding'} eq "ascii") {
298 # we'll use ascii2utf8() for this as it's faster than going
299 # through convert2unicode()
300 undef $/;
301 my $text = <$handle>;
302 $/ = "\n";
303 $$outputref .= &unicode::ascii2utf8 (\$text);
304 return;
305 }
306
307 # everything else uses unicode::convert2unicode
308 undef $/;
309 my $text = <$handle>;
310 $/ = "\n";
311 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
312}
313
3141;
Note: See TracBrowser for help on using the repository browser.