source: trunk/gsdl/perllib/multiread.pm@ 3537

Last change on this file since 3537 was 3520, checked in by jrm21, 22 years ago

wrong variable name meant we were throwing away the first line of each
document that's read using multiread:read_line... not very good.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.9 KB
Line 
1###########################################################################
2#
3# multiread.pm --
4#
5# Copyright (C) 1999 DigiLib Systems Limited, NZ
6#
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20#
21###########################################################################
22
23# the multiread object will read in a number of encodings,
24# the results are always returned in the utf-8 format
25
26# encodings currently supported are
27#
28# utf8 - either utf8 or unicode (automatically detected)
29# unicode - just unicode (doesn't currently do endian detection)
30#
31# plus all encodings in the "encodings" package
32
33package multiread;
34
35use unicode;
36
37sub new {
38 my ($class) = @_;
39
40 my $self = {'handle' => "",
41 'first' => 1,
42 'encoding' => "utf8",
43 'bigendian' => 1};
44
45 return bless $self, $class;
46}
47
48# set_handle expects the file to be already open but
49# not read yet
50sub set_handle {
51 my $self = shift (@_);
52 ($self->{'handle'}) = @_;
53 $self->{'first'} = 1;
54 $self->{'encoding'} = "utf8";
55 $self->{'bigendian'} = 1;
56}
57
58# set_encoding should be called after set_handle
59sub set_encoding {
60 my $self = shift (@_);
61 ($self->{'encoding'}) = @_;
62}
63
64sub get_encoding {
65 my $self = shift (@_);
66 return $self->{'encoding'};
67}
68
69# undef will be returned if the eof has been reached
70# the result will always be returned in utf-8
71# if automatic detection between utf8 and unicode is desired
72# then the encoding should be initially set to utf8
73sub read_unicode_char {
74 my $self = shift (@_);
75
76 # make sure we have a file handle
77 return undef if ($self->{'handle'} eq "");
78 my $handle = $self->{'handle'};
79 binmode ($handle);
80
81 if ($self->{'encoding'} eq "utf8") {
82 # utf-8 text, how many characters we get depends
83 # on what we find
84 my $c1 = "";
85 my $c2 = "";
86 my $c3 = "";
87
88 while (!eof ($handle)) {
89 $c1 = ord (getc ($handle));
90 if ($self->{'first'}) {
91 $self->{'first'} = 0;
92
93 if ($c1 == 0xfe || $c1 == 0xff) {
94 $c2 = ord (getc ($handle)) if (!eof ($handle));
95
96 # if unicode fall through to the unicode reading code
97 if ($c1 == 0xff && $c2 == 0xfe) {
98 $self->{'encoding'} = "unicode";
99 $self->{'bigendian'} = 0;
100 last;
101
102 } elsif ($c1 == 0xfe && $c2 == 0xff) {
103 $self->{'encoding'} = "unicode";
104 $self->{'bigendian'} = 1;
105 last;
106 }
107
108 # an error, but we might be able to recover
109 # from it
110 $c1 = $c2;
111 }
112 }
113
114 if ($c1 <= 0x7f) {
115 # one byte character
116 return chr ($c1);
117
118 } elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
119 # two byte character
120 $c2 = getc ($handle) if (!eof ($handle));
121 return chr ($c1) . $c2;
122
123 } elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
124 # three byte character
125 $c2 = getc ($handle) if (!eof ($handle));
126 $c3 = getc ($handle) if (!eof ($handle));
127 return chr ($c1) . $c2 . $c3;
128 }
129
130 # if we get here there was an error in the file, we should
131 # be able to recover from it however, maybe the file is in
132 # another encoding
133 }
134
135 return undef if (eof ($handle));
136 }
137
138 if ($self->{'encoding'} eq "unicode") {
139 # unicode text, get the next two characters
140 return undef if (eof ($handle));
141 my $c1 = ord (getc ($handle));
142 return undef if (eof ($handle));
143 my $c2 = ord (getc ($handle));
144
145 return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
146 }
147
148 return undef;
149}
150
151
152# undef will be returned if the eof has been reached
153# the result will always be returned in utf-8
154sub read_line {
155 my $self = shift (@_);
156
157 # make sure we have a file handle
158 return undef if ($self->{'handle'} eq "");
159
160 my $handle = $self->{'handle'};
161
162 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
163 # special case for the first line of utf8 text to detect whether
164 # the file is in utf8 or unicode
165 my $out = "";
166 my $thisc = "";
167 while (defined ($thisc = $self->read_unicode_char())) {
168 $out .= $thisc;
169 last if ($thisc eq "\n");
170 }
171
172 return $out if (length ($out) > 0);
173 return undef;
174 }
175
176 if ($self->{'encoding'} eq "utf8") {
177 # utf-8 line
178 return <$handle>;
179 }
180
181 if ($self->{'encoding'} eq "unicode") {
182 # unicode line
183 my $c = "";
184 my ($c1, $c2) = ("", "");
185 my $out = "";
186 while (read ($handle, $c, 2) == 2) {
187 $c1 = ord (substr ($c, 0, 1));
188 $c2 = ord (substr ($c, 1, 1));
189 $c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1*256+$c2) : ($c2*256+$c1))]);
190 $out .= $c;
191 last if ($c eq "\n");
192 }
193
194 return $out if (length ($out) > 0);
195 return undef;
196 }
197
198 if ($self->{'encoding'} eq "iso_8859_1") {
199 # we'll use ascii2utf8() for this as it's faster than going
200 # through convert2unicode()
201 my $line = "";
202 if (defined ($line = <$handle>)) {
203 return &unicode::ascii2utf8 (\$line);
204 }
205 }
206
207 # everything else uses unicode::convert2unicode
208 my $line = "";
209 if (defined ($line = <$handle>)) {
210 return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
211 }
212
213 return undef;
214}
215
216
217# will convert entire contents of file to utf8 and append result to $outputref
218# this may be a slightly faster way to get the contents of a file than by
219# recursively calling read_line()
220sub read_file {
221 my $self = shift (@_);
222 my ($outputref) = @_;
223
224 # make sure we have a file handle
225 return if ($self->{'handle'} eq "");
226
227 my $handle = $self->{'handle'};
228
229 if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
230 # special case for the first line of utf8 text to detect whether
231 # the file is in utf8 or unicode
232
233 # possible to have no text here...
234 my $read_text = $self->read_line ();
235 $$outputref .= $read_text if (defined($read_text));
236 }
237
238 if ($self->{'encoding'} eq "utf8") {
239 undef $/;
240 $$outputref .= <$handle>;
241 $/ = "\n";
242 return;
243 }
244
245 if ($self->{'encoding'} eq "unicode") {
246 my $line = "";
247 while (defined ($line = $self->read_line())) {
248 $$outputref .= $line;
249 }
250 return;
251 }
252
253 if ($self->{'encoding'} eq "iso_8859_1") {
254 # we'll use ascii2utf8() for this as it's faster than going
255 # through convert2unicode()
256 undef $/;
257 my $text = <$handle>;
258 $/ = "\n";
259 $$outputref .= &unicode::ascii2utf8 (\$text);
260 return;
261 }
262
263 # everything else uses unicode::convert2unicode
264 undef $/;
265 my $text = <$handle>;
266 $/ = "\n";
267 $$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
268}
269
2701;
Note: See TracBrowser for help on using the repository browser.