Context Navigation

source: trunk/gsdl/perllib/multiread.pm@ 1483

Last change on this file since 1483 was 1227, checked in by sjboddie, 24 years ago
Modified the perl code for importing arabic encoded documents. Plugins now support a windows_1256 and an iso_8859_6 encoding. I was briefly under the impression that these two encodings were similar enough to be treated the same. It turns out they're not. It appears that the Windows codepage 1256 is the most commonly used Arabic encoding so "arabic" is a synonym for windows_1256.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.6 KB

Line
1	###########################################################################
2	#
3	# multiread.pm --
4	#
5	# Copyright (C) 1999 DigiLib Systems Limited, NZ
6	#
7	# This program is free software; you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation; either version 2 of the License, or
10	# (at your option) any later version.
11	#
12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.
16	#
17	# You should have received a copy of the GNU General Public License
18	# along with this program; if not, write to the Free Software
19	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20	#
21	###########################################################################
22
23	# the multiread object will read in a number of encodings,
24	# the results are always returned in the utf-8 format
25
26	# encodings currently supported are
27	#
28	# utf8 - either utf8 or unicode (automatically detected)
29	# unicode - just unicode (doesn't currently do endian detection)
30	# gb - GB
31	# iso_8859_1 - extended ascii (iso-8859-1)
32	# iso_8859_6 - 8 bit arabic (iso-8859-6)
33	# windows_1256 - Windows codepage 1256 (Arabic)
34
35	package multiread;
36
37	use unicode;
38	use gb;
39
40	sub new {
41	my ($class) = @_;
42
43	my $self = {'handle' => "",
44	'first' => 1,
45	'encoding' => "utf8",
46	'bigendian' => 1};
47
48	return bless $self, $class;
49	}
50
51	# set_handle expects the file to be already open but
52	# not read yet
53	sub set_handle {
54	my $self = shift (@_);
55	($self->{'handle'}) = @_;
56	$self->{'first'} = 1;
57	$self->{'encoding'} = "utf8";
58	$self->{'bigendian'} = 1;
59	}
60
61	# set_encoding should be called after set_handle
62	sub set_encoding {
63	my $self = shift (@_);
64	($self->{'encoding'}) = @_;
65	}
66
67	sub get_encoding {
68	my $self = shift (@_);
69	return $self->{'encoding'};
70	}
71
72	# undef will be returned if the eof has been reached
73	# the result will always be returned in utf-8
74	# if automatic detection between utf8 and unicode is desired
75	# then the encoding should be initially set to utf8
76	sub read_char {
77	my $self = shift (@_);
78
79	# make sure we have a file handle
80	return undef if ($self->{'handle'} eq "");
81	my $handle = $self->{'handle'};
82
83	if ($self->{'encoding'} eq "utf8") {
84	# utf-8 text, how many characters we get depends
85	# on what we find
86	my $c1 = "";
87	my $c2 = "";
88	my $c3 = "";
89
90	while (!eof ($handle)) {
91	$c1 = ord (getc ($handle));
92	if ($self->{'first'}) {
93	$self->{'first'} = 0;
94
95	if ($c1 == 0xfe \|\| $c1 == 0xff) {
96	$c2 = ord (getc ($handle)) if (!eof ($handle));
97
98	# if unicode fall through to the unicode reading code
99	if ($c1 == 0xff && $c2 == 0xfe) {
100	$self->{'encoding'} = "unicode";
101	$self->{'bigendian'} = 0;
102	if ($ENV{'GSDLOS'} =~ /windows/i) {
103	binmode ($handle); # silly windows
104	}
105	last;
106
107	} elsif ($c1 == 0xfe && $c2 == 0xff) {
108	$self->{'encoding'} = "unicode";
109	$self->{'bigendian'} = 1;
110	if ($ENV{'GSDLOS'} =~ /windows/i) {
111	binmode ($handle); # silly windows
112	}
113	last;
114	}
115
116	# an error, but we might be able to recover
117	# from it
118	$c1 = $c2;
119	}
120	}
121
122	if ($c1 <= 0x7f) {
123	# one byte character
124	return chr ($c1);
125
126	} elsif ($c1 >= 0xc0 && $c1 <= 0xdf) {
127	# two byte character
128	$c2 = getc ($handle) if (!eof ($handle));
129	return chr ($c1) . $c2;
130
131	} elsif ($c1 >= 0xe0 && $c1 <= 0xef) {
132	# three byte character
133	$c2 = getc ($handle) if (!eof ($handle));
134	$c3 = getc ($handle) if (!eof ($handle));
135	return chr ($c1) . $c2 . $c3;
136	}
137
138	# if we get here there was an error in the file, we should
139	# be able to recover from it however, maybe the file is in
140	# another encoding
141	}
142
143	return undef if (eof ($handle));
144	}
145
146	if ($self->{'encoding'} eq "unicode") {
147	# unicode text, get the next two characters
148	return undef if (eof ($handle));
149	my $c1 = ord (getc ($handle));
150	return undef if (eof ($handle));
151	my $c2 = ord (getc ($handle));
152
153	return &unicode::unicode2utf8 ([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
154	}
155
156	if ($self->{'encoding'} eq "gb") {
157	# GB or GBK
158	return undef if (eof ($handle));
159	my $c1 = getc ($handle);
160	if (ord ($c1) >= 0x81) {
161	# double byte character
162	return undef if (eof ($handle));
163	my $c2 = getc ($handle);
164	return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
165
166	} else {
167	# single byte character
168	return &unicode::ascii2utf8 ($c1);
169	}
170	}
171
172	if ($self->{'encoding'} eq "iso_8859_1") {
173	# Latin 1 extended ascii (ISO-8859-1)
174	return undef if (eof ($handle));
175	return &unicode::ascii2utf8 (getc ($handle));
176	}
177
178	if ($self->{'encoding'} eq "iso_8859_6") {
179	# 8 bit Arabic (IOS-8859-6)
180	return undef if (eof ($handle));
181	return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
182	}
183
184	if ($self->{'encoding'} eq "windows_1256") {
185	# Windows 1256 (Arabic)
186	return undef if (eof ($handle));
187	return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
188	}
189
190	# unknown encoding
191	return undef;
192	}
193
194
195	# undef will be returned if the eof has been reached
196	# the result will always be returned in utf-8
197	sub read_line {
198	my $self = shift (@_);
199
200	# make sure we have a file handle
201	return undef if ($self->{'handle'} eq "");
202
203	my $handle = $self->{'handle'};
204
205	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
206	# special case for the first line of utf8 text to detect whether
207	# the file is in utf8 or unicode
208	my $out = "";
209	my $thisc = "";
210	while (defined ($thisc = $self->read_char())) {
211	$out .= $thisc;
212	last if ($thisc eq "\n");
213	}
214
215	return $out if (length ($out) > 0);
216	return undef;
217	}
218
219
220	if ($self->{'encoding'} eq "utf8") {
221	# utf-8 line
222	return <$handle>;
223	}
224
225	if ($self->{'encoding'} eq "unicode") {
226	# unicode line
227	my $c = "";
228	my ($c1, $c2) = ("", "");
229	my $out = "";
230	while (read ($handle, $c, 2) == 2) {
231	$c1 = ord (substr ($c, 0, 1));
232	$c2 = ord (substr ($c, 1, 1));
233	$c = &unicode::unicode2utf8([(($self->{'bigendian'}) ? ($c1256+$c2) : ($c2256+$c1))]);
234	$out .= $c;
235	last if ($c eq "\n");
236	}
237
238	return $out if (length ($out) > 0);
239	return undef;
240	}
241
242	if ($self->{'encoding'} eq "gb") {
243	# GB or GBK
244	my $line = "";
245	if (defined ($line = <$handle>)) {
246	return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
247	}
248	return undef;
249	}
250
251	if ($self->{'encoding'} eq "iso_8859_1") {
252	# extended ascii (ISO-8859-1)
253	my $line = "";
254	if (defined ($line = <$handle>)) {
255	return &unicode::ascii2utf8 ($line);
256	}
257	return undef;
258	}
259
260	if ($self->{'encoding'} eq "iso_8859_6") {
261	# 8 bit arabic (ISO-8859-6)
262	my $line = "";
263	if (defined ($line = <$handle>)) {
264	return &unicode::unicode2utf8(&unicode::arabic2unicode ($line));
265	}
266	return undef;
267	}
268
269	if ($self->{'encoding'} eq "windows_1256") {
270	# Windows 1256 (Arabic)
271	my $line = "";
272	if (defined ($line = <$handle>)) {
273	return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
274	}
275	return undef;
276	}
277
278	# unknown encoding
279	return undef;
280	}
281
282
283	# will convert entire contents of file to utf8 and append result to $outputref
284	# this may be a slightly faster way to get the contents of a file than by
285	# recursively calling read_line()
286	sub read_file {
287	my $self = shift (@_);
288	my ($outputref) = @_;
289
290	# make sure we have a file handle
291	return if ($self->{'handle'} eq "");
292
293	my $handle = $self->{'handle'};
294
295	if ($self->{'first'} && $self->{'encoding'} eq "utf8") {
296	# special case for the first line of utf8 text to detect whether
297	# the file is in utf8 or unicode
298	$$text .= $self->read_line ();
299	}
300
301	if ($self->{'encoding'} eq "utf8") {
302	undef $/;
303	$$outputref .= <$handle>;
304	$/ = "\n";
305	return;
306	}
307
308	if ($self->{'encoding'} eq "unicode") {
309	my $line = "";
310	while (defined ($line = $self->read_line())) {
311	$$outputref .= $line;
312	}
313	return;
314	}
315
316	if ($self->{'encoding'} eq "gb") {
317	undef $/;
318	my $text = <$handle>;
319	$/ = "\n";
320	$$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
321	return;
322	}
323
324	if ($self->{'encoding'} eq "iso_8859_1") {
325	undef $/;
326	my $text = <$handle>;
327	$/ = "\n";
328	$$outputref .= &unicode::ascii2utf8 ($text);
329	return;
330	}
331
332	if ($self->{'encoding'} eq "iso_8859_6") {
333	my $text = <$handle>;
334	undef $/;
335	$/ = "\n";
336	$$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
337	return;
338	}
339
340	if ($self->{'encoding'} eq "windows_1256") {
341	undef $/;
342	my $text = <$handle>;
343	$/ = "\n";
344	$$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
345	return;
346	}
347	}
348
349
350	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: