Context Navigation

source: main/trunk/greenstone2/perllib/multiread.pm@ 21207

Last change on this file since 21207 was 20579, checked in by davidb, 15 years ago
Unicode encoding broken down into two steps.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	# Copyright (C) 2005 New Zealand Digital Library project
7	#
8	# This program is free software; you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation; either version 2 of the License, or
11	# (at your option) any later version.
12	#
13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.
17	#
18	# You should have received a copy of the GNU General Public License
19	# along with this program; if not, write to the Free Software
20	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21	#
22	###########################################################################
23
24	# the multiread object will read in a number of encodings,
25	# the results are always returned in the utf-8 format
26
27	# encodings currently supported are
28	#
29	# utf8 - either utf8 or unicode (automatically detected)
30	# unicode - 2-byte UCS (does endian detection)
31	#
32	# plus all encodings in the "encodings" package
33
34	package multiread;
35
36	eval {require bytes};
37
38	use strict;
39	no strict 'refs'; # allow filehandles to be variables and viceversa
40
41	use unicode;
42
43	sub new {
44	my ($class) = @_;
45
46	my $self = {'handle' => "",
47	'first' => 1,
48	'encoding' => "utf8",
49	'bigendian' => 1};
50
51	return bless $self, $class;
52	}
53
54	# set_handle expects the file to be already open but
55	# not read yet
56	sub set_handle {
57	my $self = shift;
58	$self->{'handle'} = shift;
59	binmode( $self->{'handle'} );
60	$self->{'first'} = 1;
61	$self->{'encoding'} = "utf8";
62	$self->{'bigendian'} = 1;
63	}
64
65	# set_encoding should be called after set_handle
66	sub set_encoding {
67	my $self = shift;
68	$self->{'encoding'} = shift;
69	}
70
71	sub get_encoding {
72	my $self = shift (@_);
73	return $self->{'encoding'};
74	}
75
76	# undef will be returned if the eof has been reached
77	# the result will always be returned in utf-8
78
79	sub read_unicode_char {
80	my $self = shift (@_);
81
82	# make sure we have a file handle
83	return undef if ($self->{'handle'} eq "");
84	my $handle = $self->{'handle'};
85
86	if ($self->{'encoding'} eq "utf8") {
87	# utf-8 text, how many characters we get depends
88	# on what we find
89	my $c1 = "";
90	my $c2 = "";
91	my $c3 = "";
92
93	while (!eof ($handle)) {
94	$c1 = ord (getc ($handle));
95
96	if ($c1 <= 0x7f) {
97	# one byte character
98	return chr ($c1);
99
100	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
101	# two byte character
102	$c2 = getc ($handle) if (!eof ($handle));
103	return chr ($c1) . $c2;
104
105	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
106	# three byte character
107	$c2 = getc ($handle) if (!eof ($handle));
108	$c3 = getc ($handle) if (!eof ($handle));
109	return chr ($c1) . $c2 . $c3;
110	}
111
112	# if we get here there was an error in the file, we should
113	# be able to recover from it however, maybe the file is in
114	# another encoding
115	}
116
117	return undef if (eof ($handle));
118	}
119
120	if ($self->{'encoding'} eq "unicode") {
121	# unicode text, get the next two characters
122	return undef if (eof ($handle));
123	my $c1 = ord (getc ($handle));
124	return undef if (eof ($handle));
125	my $c2 = ord (getc ($handle));
126
127	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
128	}
129
130	return undef;
131	}
132
133
134	sub unicodechar_to_ord
135	{
136	my $self = shift (@_);
137	my ($unicode_text) = @_;
138
139	my $bigendian_ord_array = [];
140
141	my @unicodechar_array = ($unicode_text =~ m/(..)/g);
142
143	foreach my $pair (@unicodechar_array) {
144	# for each 2 byte pair
145	my $c1=ord(substr($pair,0,1));
146	my $c2=ord(substr($pair,1,1));
147
148	my $be_ord = ($self->{'bigendian'}) ? $c1256+$c2 : $c2256+$c1;
149	push(@$bigendian_ord_array,$be_ord);
150	}
151
152	return $bigendian_ord_array;
153	}
154
155
156	# undef will be returned if the eof has been reached
157	# the result will always be returned in utf-8
158	sub read_line {
159	my $self = shift (@_);
160
161	# make sure we have a file handle
162	return undef if ($self->{'handle'} eq "");
163
164	my $handle = $self->{'handle'};
165
166	if ($self->{'encoding'} eq "utf8") {
167	# utf-8 line
168	return <$handle>;
169	}
170
171	if ($self->{'encoding'} eq "unicode") {
172	# unicode line
173	my $c = "";
174	my ($c1, $c2) = ("", "");
175	my $out = "";
176	while (read ($handle, $c, 2) == 2) {
177	$c1 = ord (substr ($c, 0, 1));
178	$c2 = ord (substr ($c, 1, 1));
179	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
180	$out .= $c;
181	last if ($c eq "\n");
182	}
183
184	return $out if (length ($out) > 0);
185	return undef;
186	}
187
188	if ($self->{'encoding'} eq "iso_8859_1") {
189	# we'll use ascii2utf8() for this as it's faster than going
190	# through convert2unicode()
191	my $line = "";
192	if (defined ($line = <$handle>)) {
193	return &unicode::ascii2utf8 (\$line);
194	}
195	}
196
197	# everything else uses unicode::convert2unicode
198	my $line = "";
199	if (defined ($line = <$handle>)) {
200	return &unicode::unicode2utf8 (&unicode::convert2unicode ($self->{'encoding'}, \$line));
201	}
202
203	return undef;
204	}
205
206
207
208	# this will look for a Byte Order Marker at the start of the file, and
209	# set the encoding appropriately if there is one, returning any
210	# non-marker text on the first line (or returns undef).
211	sub find_unicode_bom {
212	my $self=shift;
213
214	my $non_bom_text=""; # to return if we read in 'real' text
215
216	if ($self->{'first'} == 0) { return }
217
218	# make sure we have a file handle
219	return if ($self->{'handle'} eq "");
220	my $handle = $self->{'handle'};
221
222	$self->{'first'} = 0;
223
224	my $gc = getc ($handle);
225	my $b1 = ord($gc);
226	my $b2;
227	my $b3;
228
229	if ($b1 == 0xfe \|\| $b1 == 0xff) {
230	$b2 = ord (getc ($handle)) if (!eof ($handle));
231	if ($b1 == 0xff && $b2 == 0xfe) {
232	$self->{'encoding'} = "unicode";
233	$self->{'bigendian'} = 0;
234	return;
235	} elsif ($b1 == 0xfe && $b2 == 0xff) {
236	$self->{'encoding'} = "unicode";
237	$self->{'bigendian'} = 1;
238	return;
239	} elsif ($b1 == 0xef && $b2 == 0xbb) {
240	$b3 = ord(getc($handle));
241	if ($b3 == 0xbf) {
242	$self->{'encoding'} = "utf8";
243	$self->{'bigendian'} = 1;
244	return;
245	}
246	else {
247	# put back all three bytes
248	$handle->ungetc($b3);
249	$handle->ungetc($b2);
250	$handle->ungetc($b1); return;
251
252	}
253	}
254	else {
255	# put back two bytes read
256	$handle->ungetc($b2);
257	$handle->ungetc($b1); return;
258	}
259	} else { # $b1 != fe or ff
260	# put back the one byte read
261	$handle->ungetc($b1); return;
262	}
263	}
264
265
266	sub read_file_no_decoding
267	{
268	my $self = shift (@_);
269	my ($outputref) = @_;
270
271	# make sure we have a file handle
272	return if ($self->{'handle'} eq "");
273
274	my $handle = $self->{'handle'};
275
276	# if encoding is set to utf8 or unicode, sniff to see if there is a
277	# byte order marker
278	if ($self->{'first'} &&
279	($self->{'encoding'} eq "utf8" \|\| $self->{'encoding'} eq 'unicode')) {
280
281	# this will change $self's encoding if there is a BOM (but won't consume any characters)
282	$self->find_unicode_bom();
283	}
284
285	undef $/;
286	$$outputref .= <$handle>;
287	$/ = "\n";
288
289	}
290
291
292	# will convert entire contents of file to utf8 and append result to $outputref
293	# this may be a slightly faster way to get the contents of a file than by
294	# recursively calling read_line()
295	sub decode_text {
296	my $self = shift (@_);
297
298	my ($raw_text,$decoded_text_ref) = @_;
299
300	if ($self->{'encoding'} eq "utf8") {
301	# Nothing to do, raw text is in utf 8
302	$$decoded_text_ref .= $raw_text;
303	return;
304	}
305
306	if ($self->{'encoding'} eq "unicode") {
307	my $unicode_array = $self->unicodechar_to_ord($raw_text);
308	$$decoded_text_ref .= &unicode::unicode2utf8($unicode_array);
309	return;
310	}
311
312	if ($self->{'encoding'} eq "iso_8859_1" \|\| $self->{'encoding'} eq "ascii") {
313	# we'll use ascii2utf8() for this as it's faster than going
314	# through convert2unicode()
315	$$decoded_text_ref .= &unicode::ascii2utf8 (\$raw_text);
316	return;
317	}
318
319	# everything else uses unicode::convert2unicode
320	my $unicode_text = &unicode::convert2unicode ($self->{'encoding'}, \$raw_text);
321
322	$$decoded_text_ref .= &unicode::unicode2utf8 ($unicode_text);
323
324	### print STDERR "!!! decoded ", join(":",map { ord($_) } split(//,$$decoded_text_ref)), "\n";
325	}
326
327
328
329	# will convert entire contents of file to utf8 and append result to $outputref
330	# this may be a slightly faster way to get the contents of a file than by
331	# recursively calling read_line()
332	sub read_file {
333	my $self = shift (@_);
334	my ($outputref) = @_;
335
336	# While unusual, $raw_text is initialized to $$outputref
337	# to be consistent with code before refactoring
338	my $raw_text = $$outputref;
339
340	$self->read_file_no_decoding(\$raw_text);
341	$self->decode_text($raw_text,$outputref);
342	}
343
344	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: