source: tags/gsdl-2_30d-distribution/gsdl/perllib/multiread.pm@ 2308

Last change on this file since 2308 was 1870, checked in by sjboddie, 23 years ago

Tidied up language support stuff.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30#
31# plus all encodings in the "encodings" package
32
33package multiread;
34
35use unicode;
36
37sub new {
38 my ($class) = @_;
39
40 my $self = {'handle' => "",
41 'first' => 1,
42 'encoding' => "utf8",
43 'bigendian' => 1};
44
45 return bless $self, $class;
46}
47
48# set_handle expects the file to be already open but
49# not read yet
50sub set_handle {
51 my $self = shift (@_);
52 ($self->{'handle'}) = @_;
53 $self->{'first'} = 1;
54 $self->{'encoding'} = "utf8";
55 $self->{'bigendian'} = 1;
56}
57
58# set_encoding should be called after set_handle
59sub set_encoding {
60 my $self = shift (@_);
61 ($self->{'encoding'}) = @_;
62}
63
64sub get_encoding {
65 my $self = shift (@_);
66 return $self->{'encoding'};
67}
68
69# undef will be returned if the eof has been reached
70# the result will always be returned in utf-8
71# if automatic detection between utf8 and unicode is desired
72# then the encoding should be initially set to utf8
73sub read_unicode_char {
74 my $self = shift (@_);
75
76 # make sure we have a file handle
77 return undef if ($self->{'handle'} eq "");
78 my $handle = $self->{'handle'};
79 binmode ($handle);
80
81 if ($self->{'encoding'} eq "utf8") {
82 # utf-8 text, how many characters we get depends
83 # on what we find
84 my $c1 = "";
85 my $c2 = "";
86 my $c3 = "";
87
88 while (!eof ($handle)) {
89 $c1 = ord (getc ($handle));
90 if ($self->{'first'}) {
91 $self->{'first'} = 0;
92
93 if ($c1 == 0xfe || $c1 == 0xff) {
94 $c2 = ord (getc ($handle)) if (!eof ($handle));
95
96 # if unicode fall through to the unicode reading code
97 if ($c1 == 0xff && $c2 == 0xfe) {
98 $self->{'encoding'} = "unicode";
99 $self->{'bigendian'} = 0;
100 last;
101
102 } elsif ($c1 == 0xfe && $c2 == 0xff) {
103 $self->{'encoding'} = "unicode";
104 $self->{'bigendian'} = 1;
105 last;
106 }
107
108 # an error, but we might be able to recover
109 # from it
110 $c1 = $c2;
111 }
112 }
113
114 if ($c1 <= 0x7f) {
115 # one byte character
116 return chr ($c1);
117
118 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
119 # two byte character
120 $c2 = getc ($handle) if (!eof ($handle));
121 return chr ($c1) . $c2;
122
123 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
124 # three byte character
125 $c2 = getc ($handle) if (!eof ($handle));
126 $c3 = getc ($handle) if (!eof ($handle));
127 return chr ($c1) . $c2 . $c3;
128 }
129
130 # if we get here there was an error in the file, we should
131 # be able to recover from it however, maybe the file is in
132 # another encoding
133 }
134
135 return undef if (eof ($handle));
136 }
137
138 if ($self->{'encoding'} eq "unicode") {
139 # unicode text, get the next two characters
140 return undef if (eof ($handle));
141 my $c1 = ord (getc ($handle));
142 return undef if (eof ($handle));
143 my $c2 = ord (getc ($handle));
144
145 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
146 }
147
148 return undef;
149}
150
151
152# undef will be returned if the eof has been reached
153# the result will always be returned in utf-8
154sub read_line {
155 my $self = shift (@_);
156
157 # make sure we have a file handle
158 return undef if ($self->{'handle'} eq "");
159
160 my $handle = $self->{'handle'};
161
162 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
163 # special case for the first line of utf8 text to detect whether
164 # the file is in utf8 or unicode
165 my $out = "";
166 my $thisc = "";
167 while (defined ($thisc = $self->read_unicode_char())) {
168 $out .= $thisc;
169 last if ($thisc eq "\n");
170 }
171
172 return $out if (length ($out) > 0);
173 return undef;
174 }
175
176 if ($self->{'encoding'} eq "utf8") {
177 # utf-8 line
178 return <$handle>;
179 }
180
181 if ($self->{'encoding'} eq "unicode") {
182 # unicode line
183 my $c = "";
184 my ($c1, $c2) = ("", "");
185 my $out = "";
186 while (read ($handle, $c, 2) == 2) {
187 $c1 = ord (substr ($c, 0, 1));
188 $c2 = ord (substr ($c, 1, 1));
189 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
190 $out .= $c;
191 last if ($c eq "\n");
192 }
193
194 return $out if (length ($out) > 0);
195 return undef;
196 }
197
198 if ($self->{'encoding'} eq "iso_8859_1") {
199 # we'll use ascii2utf8() for this as it's faster than going
200 # through convert2unicode()
201 my $line = "";
202 if (defined ($line = <$handle>)) {
203 return &unicode::ascii2utf8 (\$line);
204 }
205 }
206
207 # everything else uses unicode::convert2unicode
208 my $line = "";
209 if (defined ($line = <$handle>)) {
210 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
211 }
212
213 return undef;
214}
215
216
217# will convert entire contents of file to utf8 and append result to $outputref
218# this may be a slightly faster way to get the contents of a file than by
219# recursively calling read_line()
220sub read_file {
221 my $self = shift (@_);
222 my ($outputref) = @_;
223
224 # make sure we have a file handle
225 return if ($self->{'handle'} eq "");
226
227 my $handle = $self->{'handle'};
228
229 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
230 # special case for the first line of utf8 text to detect whether
231 # the file is in utf8 or unicode
232 $$text .= $self->read_line ();
233 }
234
235 if ($self->{'encoding'} eq "utf8") {
236 undef $/;
237 $$outputref .= <$handle>;
238 $/ = "\n";
239 return;
240 }
241
242 if ($self->{'encoding'} eq "unicode") {
243 my $line = "";
244 while (defined ($line = $self->read_line())) {
245 $$outputref .= $line;
246 }
247 return;
248 }
249
250 if ($self->{'encoding'} eq "iso_8859_1") {
251 # we'll use ascii2utf8() for this as it's faster than going
252 # through convert2unicode()
253 undef $/;
254 my $text = <$handle>;
255 $/ = "\n";
256 $$outputref .= &unicode::ascii2utf8 (\$text);
257 return;
258 }
259
260 # everything else uses unicode::convert2unicode
261 undef $/;
262 my $text = <$handle>;
263 $/ = "\n";
264 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
265}
266
2671;
Note: See TracBrowser for help on using the repository browser.