Context Navigation

source: trunk/gsdl/bin/script/touc.pl@ 1587

Last change on this file since 1587 was 1227, checked in by sjboddie, 24 years ago
Modified the perl code for importing arabic encoded documents. Plugins now support a windows_1256 and an iso_8859_6 encoding. I was briefly under the impression that these two encodings were similar enough to be treated the same. It turns out they're not. It appears that the Windows codepage 1256 is the most commonly used Arabic encoding so "arabic" is a synonym for windows_1256.
Property svn:executable set to `*`. Property svn:keywords set to `Author Date Id Revision`.
File size: 2.4 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# touc.pl -- converts to unicode
6	#
7	# Copyright (C) 1999 DigiLib Systems Limited, NZ.
8	#
9	# This program is free software; you can redistribute it and/or modify
10	# it under the terms of the GNU General Public License as published by
11	# the Free Software Foundation; either version 2 of the License, or
12	# (at your option) any later version.
13	#
14	# This program is distributed in the hope that it will be useful,
15	# but WITHOUT ANY WARRANTY; without even the implied warranty of
16	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17	# GNU General Public License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with this program; if not, write to the Free Software
21	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22	#
23	###########################################################################
24
use strict;

# Make the Greenstone perl library visible before any of its modules are
# loaded.  This must run at compile time because the "use" statements below
# are resolved at compile time too.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}

use unicode;
use multiread;
use parsargv;


# One flag per supported input encoding.  parsargv::parse() is handed a
# reference to each variable alongside the option name it belongs to.
my ($unicode, $iso_8859_1, $iso_8859_6, $windows_1256, $gb);

if (!parsargv::parse(\@ARGV,
                     'unicode', \$unicode,
                     'iso_8859_1', \$iso_8859_1,
                     'iso_8859_6', \$iso_8859_6,
                     'windows_1256', \$windows_1256,
                     'gb', \$gb)) {
    # Option parsing failed: show the usage text and exit with an error.
    print STDERR "\n usage: $0 [options]\n\n";
    print STDERR " options:\n";
    print STDERR " -unicode input is in utf-8 or unicode (default)\n";
    print STDERR " -iso_8859_1 input is in extended ascii (ISO-8859-1 Latin 1)\n";
    print STDERR " -iso_8859_6 input is in 8 bit Arabic (ISO-8859-6)\n";
    print STDERR " -windows_1256 input is in Windows 1256 (Arabic)\n";
    print STDERR " -gb input is in GB or GBK (simplified Chinese)\n\n";
    die "\n";
}

# Pick the input encoding.  utf8 is the default (so -unicode needs no case
# of its own); when several flags are given the last one tested here wins.
my $encoding = "utf8";
$encoding = "iso_8859_1"   if $iso_8859_1;
$encoding = "iso_8859_6"   if $iso_8859_6;
$encoding = "windows_1256" if $windows_1256;
$encoding = "gb"           if $gb;


# Everything written below is raw bytes, so put STDOUT into binary mode
# unconditionally.  This is needed on Windows (no CRLF translation), is
# harmless elsewhere, and avoids matching against $ENV{'GSDLOS'}, which
# triggers an "uninitialized value" warning under -w when it is not set.
binmode (STDOUT);

# byte order mark: output is little endian
print "\xff\xfe";

# Read standard input line by line through multiread, using the encoding
# selected above.
my $reader = multiread->new ();
$reader->set_handle ('main::STDIN');
$reader->set_encoding ($encoding);

while (defined (my $line = $reader->read_line())) {
    # utf82unicode() hands back a reference to an array of numeric
    # character values; each is written as two bytes, low byte first.
    # NOTE(review): assumes every value fits in 16 bits, as the previous
    # hand-rolled byte splitting also did -- confirm against unicode.pm.
    my $ucline = unicode::utf82unicode ($line);
    print pack ('v*', @$ucline);
}

# Buffered write errors (full disk, closed pipe) only surface on close.
close (STDOUT) or die "$0: error writing to standard output: $!\n";

Note: See TracBrowser for help on using the repository browser.

Download in other formats: