Context Navigation

source: trunk/gsdl/perllib/multiread.pm@ 3537

Last change on this file since 3537 was 3520, checked in by jrm21, 22 years ago
wrong variable name meant we were throwing away the first line of each document that's read using multiread:read_line... not very good.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.9 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	#
31	# plus all encodings in the "encodings" package
32
33	package multiread;
34
35	use unicode;
36
37	sub new {
38	my ($class) = @_;
39
40	my $self = {'handle' => "",
41	'first' => 1,
42	'encoding' => "utf8",
43	'bigendian' => 1};
44
45	return bless $self, $class;
46	}
47
48	# set_handle expects the file to be already open but
49	# not read yet
50	sub set_handle {
51	my $self = shift (@_);
52	($self->{'handle'}) = @_;
53	$self->{'first'} = 1;
54	$self->{'encoding'} = "utf8";
55	$self->{'bigendian'} = 1;
56	}
57
58	# set_encoding should be called after set_handle
59	sub set_encoding {
60	my $self = shift (@_);
61	($self->{'encoding'}) = @_;
62	}
63
64	sub get_encoding {
65	my $self = shift (@_);
66	return $self->{'encoding'};
67	}
68
69	# undef will be returned if the eof has been reached
70	# the result will always be returned in utf-8
71	# if automatic detection between utf8 and unicode is desired
72	# then the encoding should be initially set to utf8
73	sub read_unicode_char {
74	my $self = shift (@_);
75
76	# make sure we have a file handle
77	return undef if ($self->{'handle'} eq "");
78	my $handle = $self->{'handle'};
79	binmode ($handle);
80
81	if ($self->{'encoding'} eq "utf8") {
82	# utf-8 text, how many characters we get depends
83	# on what we find
84	my $c1 = "";
85	my $c2 = "";
86	my $c3 = "";
87
88	while (!eof ($handle)) {
89	$c1 = ord (getc ($handle));
90	if ($self->{'first'}) {
91	$self->{'first'} = 0;
92
93	if ($c1 == 0xfe \|\| $c1 == 0xff) {
94	$c2 = ord (getc ($handle)) if (!eof ($handle));
95
96	# if unicode fall through to the unicode reading code
97	if ($c1 == 0xff && $c2 == 0xfe) {
98	$self->{'encoding'} = "unicode";
99	$self->{'bigendian'} = 0;
100	last;
101
102	} elsif ($c1 == 0xfe && $c2 == 0xff) {
103	$self->{'encoding'} = "unicode";
104	$self->{'bigendian'} = 1;
105	last;
106	}
107
108	# an error, but we might be able to recover
109	# from it
110	$c1 = $c2;
111	}
112	}
113
114	if ($c1 <= 0x7f) {
115	# one byte character
116	return chr ($c1);
117
118	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
119	# two byte character
120	$c2 = getc ($handle) if (!eof ($handle));
121	return chr ($c1) . $c2;
122
123	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
124	# three byte character
125	$c2 = getc ($handle) if (!eof ($handle));
126	$c3 = getc ($handle) if (!eof ($handle));
127	return chr ($c1) . $c2 . $c3;
128	}
129
130	# if we get here there was an error in the file, we should
131	# be able to recover from it however, maybe the file is in
132	# another encoding
133	}
134
135	return undef if (eof ($handle));
136	}
137
138	if ($self->{'encoding'} eq "unicode") {
139	# unicode text, get the next two characters
140	return undef if (eof ($handle));
141	my $c1 = ord (getc ($handle));
142	return undef if (eof ($handle));
143	my $c2 = ord (getc ($handle));
144
145	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
146	}
147
148	return undef;
149	}
150
151
152	# undef will be returned if the eof has been reached
153	# the result will always be returned in utf-8
154	sub read_line {
155	my $self = shift (@_);
156
157	# make sure we have a file handle
158	return undef if ($self->{'handle'} eq "");
159
160	my $handle = $self->{'handle'};
161
162	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
163	# special case for the first line of utf8 text to detect whether
164	# the file is in utf8 or unicode
165	my $out = "";
166	my $thisc = "";
167	while (defined ($thisc = $self->read_unicode_char())) {
168	$out .= $thisc;
169	last if ($thisc eq "\n");
170	}
171
172	return $out if (length ($out) > 0);
173	return undef;
174	}
175
176	if ($self->{'encoding'} eq "utf8") {
177	# utf-8 line
178	return <$handle>;
179	}
180
181	if ($self->{'encoding'} eq "unicode") {
182	# unicode line
183	my $c = "";
184	my ($c1, $c2) = ("", "");
185	my $out = "";
186	while (read ($handle, $c, 2) == 2) {
187	$c1 = ord (substr ($c, 0, 1));
188	$c2 = ord (substr ($c, 1, 1));
189	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
190	$out .= $c;
191	last if ($c eq "\n");
192	}
193
194	return $out if (length ($out) > 0);
195	return undef;
196	}
197
198	if ($self->{'encoding'} eq "iso_8859_1") {
199	# we'll use ascii2utf8() for this as it's faster than going
200	# through convert2unicode()
201	my $line = "";
202	if (defined ($line = <$handle>)) {
203	return &unicode::ascii2utf8 (\$line);
204	}
205	}
206
207	# everything else uses unicode::convert2unicode
208	my $line = "";
209	if (defined ($line = <$handle>)) {
210	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
211	}
212
213	return undef;
214	}
215
216
217	# will convert entire contents of file to utf8 and append result to $outputref
218	# this may be a slightly faster way to get the contents of a file than by
219	# recursively calling read_line()
220	sub read_file {
221	my $self = shift (@_);
222	my ($outputref) = @_;
223
224	# make sure we have a file handle
225	return if ($self->{'handle'} eq "");
226
227	my $handle = $self->{'handle'};
228
229	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
230	# special case for the first line of utf8 text to detect whether
231	# the file is in utf8 or unicode
232
233	# possible to have no text here...
234	my $read_text = $self->read_line ();
235	$$outputref .= $read_text if (defined($read_text));
236	}
237
238	if ($self->{'encoding'} eq "utf8") {
239	undef $/;
240	$$outputref .= <$handle>;
241	$/ = "\n";
242	return;
243	}
244
245	if ($self->{'encoding'} eq "unicode") {
246	my $line = "";
247	while (defined ($line = $self->read_line())) {
248	$$outputref .= $line;
249	}
250	return;
251	}
252
253	if ($self->{'encoding'} eq "iso_8859_1") {
254	# we'll use ascii2utf8() for this as it's faster than going
255	# through convert2unicode()
256	undef $/;
257	my $text = <$handle>;
258	$/ = "\n";
259	$$outputref .= &unicode::ascii2utf8 (\$text);
260	return;
261	}
262
263	# everything else uses unicode::convert2unicode
264	undef $/;
265	my $text = <$handle>;
266	$/ = "\n";
267	$$outputref .= &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$text));
268	}
269
270	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: