Context Navigation

source: trunk/gsdl/perllib/multiread.pm@ 1844

Last change on this file since 1844 was 1844, checked in by sjboddie, 23 years ago
Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is now the default instead of 'ascii'). Wihen -input_encoding is 'auto' textcat is used to work out the language and encoding of each document prior to processing it. This allows for documents within the same collection to be in different encodings and all be imported correctly (as long as they're in an encoding that's supported - notable exceptions at the moment are Big5 Chinese and any kind of Japanese). Doing things this way means each document is read in twice at import time, no doubt slowing things down considerably. You can therefore still set -input_encoding explicitly if you know that all your documents are a particular encoding.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 9.4 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	# gb - GB
31	# iso_8859_[1-9] - 8 bit extended ascii encodings
32	# windows_125[0-6] - Windows codepages 1250 to 1256
33
34	package multiread;
35
36	use unicode;
37	use gb;
38
39	sub new {
40	my ($class) = @_;
41
42	my $self = {'handle' => "",
43	'first' => 1,
44	'encoding' => "utf8",
45	'bigendian' => 1};
46
47	return bless $self, $class;
48	}
49
50	# set_handle expects the file to be already open but
51	# not read yet
52	sub set_handle {
53	my $self = shift (@_);
54	($self->{'handle'}) = @_;
55	$self->{'first'} = 1;
56	$self->{'encoding'} = "utf8";
57	$self->{'bigendian'} = 1;
58	}
59
60	# set_encoding should be called after set_handle
61	sub set_encoding {
62	my $self = shift (@_);
63	($self->{'encoding'}) = @_;
64	}
65
66	sub get_encoding {
67	my $self = shift (@_);
68	return $self->{'encoding'};
69	}
70
71	# undef will be returned if the eof has been reached
72	# the result will always be returned in utf-8
73	# if automatic detection between utf8 and unicode is desired
74	# then the encoding should be initially set to utf8
75	sub read_char {
76	my $self = shift (@_);
77
78	# make sure we have a file handle
79	return undef if ($self->{'handle'} eq "");
80	my $handle = $self->{'handle'};
81
82	if ($self->{'encoding'} eq "utf8") {
83	# utf-8 text, how many characters we get depends
84	# on what we find
85	my $c1 = "";
86	my $c2 = "";
87	my $c3 = "";
88
89	while (!eof ($handle)) {
90	$c1 = ord (getc ($handle));
91	if ($self->{'first'}) {
92	$self->{'first'} = 0;
93
94	if ($c1 == 0xfe \|\| $c1 == 0xff) {
95	$c2 = ord (getc ($handle)) if (!eof ($handle));
96
97	# if unicode fall through to the unicode reading code
98	if ($c1 == 0xff && $c2 == 0xfe) {
99	$self->{'encoding'} = "unicode";
100	$self->{'bigendian'} = 0;
101	if ($ENV{'GSDLOS'} =~ /windows/i) {
102	binmode ($handle); # silly windows
103	}
104	last;
105
106	} elsif ($c1 == 0xfe && $c2 == 0xff) {
107	$self->{'encoding'} = "unicode";
108	$self->{'bigendian'} = 1;
109	if ($ENV{'GSDLOS'} =~ /windows/i) {
110	binmode ($handle); # silly windows
111	}
112	last;
113	}
114
115	# an error, but we might be able to recover
116	# from it
117	$c1 = $c2;
118	}
119	}
120
121	if ($c1 <= 0x7f) {
122	# one byte character
123	return chr ($c1);
124
125	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
126	# two byte character
127	$c2 = getc ($handle) if (!eof ($handle));
128	return chr ($c1) . $c2;
129
130	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
131	# three byte character
132	$c2 = getc ($handle) if (!eof ($handle));
133	$c3 = getc ($handle) if (!eof ($handle));
134	return chr ($c1) . $c2 . $c3;
135	}
136
137	# if we get here there was an error in the file, we should
138	# be able to recover from it however, maybe the file is in
139	# another encoding
140	}
141
142	return undef if (eof ($handle));
143	}
144
145	if ($self->{'encoding'} eq "unicode") {
146	# unicode text, get the next two characters
147	return undef if (eof ($handle));
148	my $c1 = ord (getc ($handle));
149	return undef if (eof ($handle));
150	my $c2 = ord (getc ($handle));
151
152	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
153	}
154
155	if ($self->{'encoding'} eq "gb") {
156	# GB or GBK
157	return undef if (eof ($handle));
158	my $c1 = getc ($handle);
159	if (ord ($c1) >= 0x81) {
160	# double byte character
161	return undef if (eof ($handle));
162	my $c2 = getc ($handle);
163	return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
164
165	} else {
166	# single byte character
167	return &unicode::ascii2utf8 ($c1);
168	}
169	}
170
171	if ($self->{'encoding'} eq "iso_8859_1") {
172	# special case for iso_8859_1 as &ascii2utf8($char) is faster than
173	# &unicode2utf8(iso2unicode('1', $char))
174	return undef if (eof ($handle));
175	return &unicode::ascii2utf8 (getc ($handle));
176	}
177
178	if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
179	return undef if (eof ($handle));
180	return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle)));
181	}
182
183	if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
184	return undef if (eof ($handle));
185	return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle)));
186	}
187
188	if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
189	return undef if (eof ($handle));
190	return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle)));
191	}
192
193	# unknown encoding
194	return undef;
195	}
196
197
198	# undef will be returned if the eof has been reached
199	# the result will always be returned in utf-8
200	sub read_line {
201	my $self = shift (@_);
202
203	# make sure we have a file handle
204	return undef if ($self->{'handle'} eq "");
205
206	my $handle = $self->{'handle'};
207
208	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
209	# special case for the first line of utf8 text to detect whether
210	# the file is in utf8 or unicode
211	my $out = "";
212	my $thisc = "";
213	while (defined ($thisc = $self->read_char())) {
214	$out .= $thisc;
215	last if ($thisc eq "\n");
216	}
217
218	return $out if (length ($out) > 0);
219	return undef;
220	}
221
222
223	if ($self->{'encoding'} eq "utf8") {
224	# utf-8 line
225	return <$handle>;
226	}
227
228	if ($self->{'encoding'} eq "unicode") {
229	# unicode line
230	my $c = "";
231	my ($c1, $c2) = ("", "");
232	my $out = "";
233	while (read ($handle, $c, 2) == 2) {
234	$c1 = ord (substr ($c, 0, 1));
235	$c2 = ord (substr ($c, 1, 1));
236	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
237	$out .= $c;
238	last if ($c eq "\n");
239	}
240
241	return $out if (length ($out) > 0);
242	return undef;
243	}
244
245	if ($self->{'encoding'} eq "gb") {
246	# GB or GBK
247	my $line = "";
248	if (defined ($line = <$handle>)) {
249	return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
250	}
251	return undef;
252	}
253
254	if ($self->{'encoding'} eq "iso_8859_1") {
255	# special case for iso_8859_1 as &ascii2utf8($line) is faster than
256	# &unicode2utf8(iso2unicode('1', $line))
257	my $line = "";
258	if (defined ($line = <$handle>)) {
259	return &unicode::ascii2utf8 ($line);
260	}
261	return undef;
262	}
263
264	if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
265	my $line = "";
266	if (defined ($line = <$handle>)) {
267	return &unicode::unicode2utf8(&unicode::iso2unicode ($1, $line));
268	}
269	return undef;
270	}
271
272	if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
273	my $line = "";
274	if (defined ($line = <$handle>)) {
275	return &unicode::unicode2utf8(&unicode::windows2unicode ($1, $line));
276	}
277	return undef;
278	}
279
280	if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
281	my $line = "";
282	if (defined ($line = <$handle>)) {
283	return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
284	}
285	return undef;
286	}
287
288	# unknown encoding
289	return undef;
290	}
291
292
293	# will convert entire contents of file to utf8 and append result to $outputref
294	# this may be a slightly faster way to get the contents of a file than by
295	# recursively calling read_line()
296	sub read_file {
297	my $self = shift (@_);
298	my ($outputref) = @_;
299
300	# make sure we have a file handle
301	return if ($self->{'handle'} eq "");
302
303	my $handle = $self->{'handle'};
304
305	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
306	# special case for the first line of utf8 text to detect whether
307	# the file is in utf8 or unicode
308	$$text .= $self->read_line ();
309	}
310
311	if ($self->{'encoding'} eq "utf8") {
312	undef $/;
313	$$outputref .= <$handle>;
314	$/ = "\n";
315	return;
316	}
317
318	if ($self->{'encoding'} eq "unicode") {
319	my $line = "";
320	while (defined ($line = $self->read_line())) {
321	$$outputref .= $line;
322	}
323	return;
324	}
325
326	if ($self->{'encoding'} eq "gb") {
327	undef $/;
328	my $text = <$handle>;
329	$/ = "\n";
330	$$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
331	return;
332	}
333
334	if ($self->{'encoding'} eq "iso_8859_1") {
335	# special case for iso_8859_1 as &ascii2utf8($text) is faster than
336	# &unicode2utf8(iso2unicode('1', $text))
337	undef $/;
338	my $text = <$handle>;
339	$/ = "\n";
340	$$outputref .= &unicode::ascii2utf8 ($text);
341	return;
342	}
343
344	if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
345	undef $/;
346	my $text = <$handle>;
347	$/ = "\n";
348	$$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text));
349	return;
350	}
351
352	if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
353	undef $/;
354	my $text = <$handle>;
355	$/ = "\n";
356	$$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text));
357	return;
358	}
359
360	if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
361	undef $/;
362	my $text = <$handle>;
363	$/ = "\n";
364	$$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text));
365	return;
366	}
367	}
368
369	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: