source: main/tags/2.52/gsdl/perllib/multiread.pm@ 25422

Last change on this file since 25422 was 3834, checked in by sjboddie, 21 years ago

Prevent "use bytes" from causing errors for older perls

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30#
31# plus all encodings in the "encodings" package
32
33package multiread;
34
35eval {require bytes};
36
37use unicode;
38
39sub new {
40 my ($class) = @_;
41
42 my $self = {'handle' => "",
43 'first' => 1,
44 'encoding' => "utf8",
45 'bigendian' => 1};
46
47 return bless $self, $class;
48}
49
50# set_handle expects the file to be already open but
51# not read yet
52sub set_handle {
53 my $self = shift (@_);
54 ($self->{'handle'}) = @_;
55 $self->{'first'} = 1;
56 $self->{'encoding'} = "utf8";
57 $self->{'bigendian'} = 1;
58}
59
60# set_encoding should be called after set_handle
61sub set_encoding {
62 my $self = shift (@_);
63 ($self->{'encoding'}) = @_;
64}
65
66sub get_encoding {
67 my $self = shift (@_);
68 return $self->{'encoding'};
69}
70
71# undef will be returned if the eof has been reached
72# the result will always be returned in utf-8
73# if automatic detection between utf8 and unicode is desired
74# then the encoding should be initially set to utf8
75sub read_unicode_char {
76 my $self = shift (@_);
77
78 # make sure we have a file handle
79 return undef if ($self->{'handle'} eq "");
80 my $handle = $self->{'handle'};
81 binmode ($handle);
82
83 if ($self->{'encoding'} eq "utf8") {
84 # utf-8 text, how many characters we get depends
85 # on what we find
86 my $c1 = "";
87 my $c2 = "";
88 my $c3 = "";
89
90 while (!eof ($handle)) {
91 $c1 = ord (getc ($handle));
92 if ($self->{'first'}) {
93 $self->{'first'} = 0;
94
95 if ($c1 == 0xfe || $c1 == 0xff) {
96 $c2 = ord (getc ($handle)) if (!eof ($handle));
97
98 # if unicode fall through to the unicode reading code
99 if ($c1 == 0xff && $c2 == 0xfe) {
100 $self->{'encoding'} = "unicode";
101 $self->{'bigendian'} = 0;
102 last;
103
104 } elsif ($c1 == 0xfe && $c2 == 0xff) {
105 $self->{'encoding'} = "unicode";
106 $self->{'bigendian'} = 1;
107 last;
108 }
109
110 # an error, but we might be able to recover
111 # from it
112 $c1 = $c2;
113 }
114 }
115
116 if ($c1 <= 0x7f) {
117 # one byte character
118 return chr ($c1);
119
120 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
121 # two byte character
122 $c2 = getc ($handle) if (!eof ($handle));
123 return chr ($c1) . $c2;
124
125 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
126 # three byte character
127 $c2 = getc ($handle) if (!eof ($handle));
128 $c3 = getc ($handle) if (!eof ($handle));
129 return chr ($c1) . $c2 . $c3;
130 }
131
132 # if we get here there was an error in the file, we should
133 # be able to recover from it however, maybe the file is in
134 # another encoding
135 }
136
137 return undef if (eof ($handle));
138 }
139
140 if ($self->{'encoding'} eq "unicode") {
141 # unicode text, get the next two characters
142 return undef if (eof ($handle));
143 my $c1 = ord (getc ($handle));
144 return undef if (eof ($handle));
145 my $c2 = ord (getc ($handle));
146
147 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
148 }
149
150 return undef;
151}
152
153
154# undef will be returned if the eof has been reached
155# the result will always be returned in utf-8
156sub read_line {
157 my $self = shift (@_);
158
159 # make sure we have a file handle
160 return undef if ($self->{'handle'} eq "");
161
162 my $handle = $self->{'handle'};
163
164 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
165 # special case for the first line of utf8 text to detect whether
166 # the file is in utf8 or unicode
167 my $out = "";
168 my $thisc = "";
169 while (defined ($thisc = $self->read_unicode_char())) {
170 $out .= $thisc;
171 last if ($thisc eq "\n");
172 }
173
174 return $out if (length ($out) > 0);
175 return undef;
176 }
177
178 if ($self->{'encoding'} eq "utf8") {
179 # utf-8 line
180 return <$handle>;
181 }
182
183 if ($self->{'encoding'} eq "unicode") {
184 # unicode line
185 my $c = "";
186 my ($c1, $c2) = ("", "");
187 my $out = "";
188 while (read ($handle, $c, 2) == 2) {
189 $c1 = ord (substr ($c, 0, 1));
190 $c2 = ord (substr ($c, 1, 1));
191 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
192 $out .= $c;
193 last if ($c eq "\n");
194 }
195
196 return $out if (length ($out) > 0);
197 return undef;
198 }
199
200 if ($self->{'encoding'} eq "iso_8859_1") {
201 # we'll use ascii2utf8() for this as it's faster than going
202 # through convert2unicode()
203 my $line = "";
204 if (defined ($line = <$handle>)) {
205 return &unicode::ascii2utf8 (\$line);
206 }
207 }
208
209 # everything else uses unicode::convert2unicode
210 my $line = "";
211 if (defined ($line = <$handle>)) {
212 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
213 }
214
215 return undef;
216}
217
218
219# will convert entire contents of file to utf8 and append result to $outputref
220# this may be a slightly faster way to get the contents of a file than by
221# recursively calling read_line()
222sub read_file {
223 my $self = shift (@_);
224 my ($outputref) = @_;
225
226 # make sure we have a file handle
227 return if ($self->{'handle'} eq "");
228
229 my $handle = $self->{'handle'};
230
231 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
232 # special case for the first line of utf8 text to detect whether
233 # the file is in utf8 or unicode
234
235 # possible to have no text here...
236 my $read_text = $self->read_line ();
237 $$outputref .= $read_text if (defined($read_text));
238 }
239
240 if ($self->{'encoding'} eq "utf8") {
241 undef $/;
242 $$outputref .= <$handle>;
243 $/ = "\n";
244 return;
245 }
246
247 if ($self->{'encoding'} eq "unicode") {
248 my $line = "";
249 while (defined ($line = $self->read_line())) {
250 $$outputref .= $line;
251 }
252 return;
253 }
254
255 if ($self->{'encoding'} eq "iso_8859_1") {
256 # we'll use ascii2utf8() for this as it's faster than going
257 # through convert2unicode()
258 undef $/;
259 my $text = <$handle>;
260 $/ = "\n";
261 $$outputref .= &unicode::ascii2utf8 (\$text);
262 return;
263 }
264
265 # everything else uses unicode::convert2unicode
266 undef $/;
267 my $text = <$handle>;
268 $/ = "\n";
269 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
270}
271
2721;
Note: See TracBrowser for help on using the repository browser.