1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | ###########################################################################
|
---|
4 | #
|
---|
5 | # makemapfile.pl --
|
---|
6 | # A component of the Greenstone digital library software
|
---|
7 | # from the New Zealand Digital Library Project at the
|
---|
8 | # University of Waikato, New Zealand.
|
---|
9 | #
|
---|
10 | # Copyright (C) 2001 New Zealand Digital Library Project
|
---|
11 | #
|
---|
12 | # This program is free software; you can redistribute it and/or modify
|
---|
13 | # it under the terms of the GNU General Public License as published by
|
---|
14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
15 | # (at your option) any later version.
|
---|
16 | #
|
---|
17 | # This program is distributed in the hope that it will be useful,
|
---|
18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | # GNU General Public License for more details.
|
---|
21 | #
|
---|
22 | # You should have received a copy of the GNU General Public License
|
---|
23 | # along with this program; if not, write to the Free Software
|
---|
24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | #
|
---|
26 | ###########################################################################
|
---|
27 |
|
---|
28 | BEGIN {
|
---|
29 | die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
|
---|
30 | die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
|
---|
31 | unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
|
---|
32 | }
|
---|
33 |
|
---|
34 | use parsargv;
|
---|
35 | use util;
|
---|
36 |
|
---|
37 | # %translations is of the form:
|
---|
38 | #
|
---|
39 | # encodings{encodingname-encodingname}->blocktranslation
|
---|
40 | # blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
|
---|
41 | #
|
---|
42 | # Any of the top translation blocks can point to an undefined
|
---|
43 | # value. This data structure aims to allow fast translation and
|
---|
44 | # efficient storage.
|
---|
45 | %translations = ();
|
---|
46 |
|
---|
47 | # @array256 is used for initialisation, there must be
|
---|
48 | # a better way...
|
---|
49 | @array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
50 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
51 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
52 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
53 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
54 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
55 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
56 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
57 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
58 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
59 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
60 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
61 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
62 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
63 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
---|
64 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
|
---|
65 |
|
---|
66 | &main();
|
---|
67 |
|
---|
68 | sub print_usage {
|
---|
69 | print STDERR "\n";
|
---|
70 | print STDERR "makemapfile.pl: Creates unicode map (.ump) files from plain\n";
|
---|
71 | print STDERR " text code pages.\n\n";
|
---|
72 | print STDERR " usage: $0 [options]\n\n";
|
---|
73 | print STDERR " options:\n";
|
---|
74 | print STDERR " -encoding name\n";
|
---|
75 | print STDERR " -mapfile text file from which to create binary ump file\n\n";
|
---|
76 | }
|
---|
77 |
|
---|
78 | sub main {
|
---|
79 | if (!parsargv::parse(\@ARGV,
|
---|
80 | 'encoding/.+', \$encoding,
|
---|
81 | 'mapfile/.+', \$mapfile)) {
|
---|
82 | &print_usage();
|
---|
83 | die "\n";
|
---|
84 | }
|
---|
85 |
|
---|
86 | if (!&loadencoding ($encoding, $mapfile)) {
|
---|
87 | die "couldn't load encoding $encoding";
|
---|
88 | }
|
---|
89 |
|
---|
90 | # write out map files
|
---|
91 | &writemapfile ("$encoding-unicode", $encoding, 1);
|
---|
92 | &writemapfile ("unicode-$encoding", $encoding, 0);
|
---|
93 | }
|
---|
94 |
|
---|
95 | sub writemapfile {
|
---|
96 | my ($encoding, $filename, $tounicode) = @_;
|
---|
97 |
|
---|
98 | $filename .= ".ump"; # unicode map file
|
---|
99 | if ($tounicode) {
|
---|
100 | $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename);
|
---|
101 | } else {
|
---|
102 | $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename);
|
---|
103 | }
|
---|
104 |
|
---|
105 | die "translation not defined" if (!defined $translations{$encoding});
|
---|
106 | my $block = $translations{$encoding};
|
---|
107 |
|
---|
108 | print "writing $filename\n";
|
---|
109 | open (MAPFILE, ">" . $filename) || die;
|
---|
110 | binmode (MAPFILE);
|
---|
111 |
|
---|
112 | my ($i, $j);
|
---|
113 | for ($i=0; $i<256; $i++) {
|
---|
114 | if (ref ($block->[$i]) eq "ARRAY") {
|
---|
115 | print MAPFILE pack ("C", $i);
|
---|
116 | for ($j=0; $j<256; $j++) {
|
---|
117 | # unsigned short in network order
|
---|
118 | print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256),
|
---|
119 | $block->[$i]->[$j] % 256);
|
---|
120 | }
|
---|
121 | }
|
---|
122 | }
|
---|
123 | close (MAPFILE);
|
---|
124 | }
|
---|
125 |
|
---|
126 | # loadencoding expects the mapfile to contain (at least) two
|
---|
127 | # tab-separated fields. The first field is the mapped value
|
---|
128 | # and the second field is the unicode value.
|
---|
129 | #
|
---|
130 | # It returns 1 if successful, 0 if unsuccessful
|
---|
131 | sub loadencoding {
|
---|
132 | my ($encoding, $mapfile) = @_;
|
---|
133 |
|
---|
134 | my $to = "$encoding-unicode";
|
---|
135 | my $from = "unicode-$encoding";
|
---|
136 |
|
---|
137 | # check to see if the encoding has already been loaded
|
---|
138 | if (defined $translations{$to} && defined $translations{$from}) {
|
---|
139 | return 1;
|
---|
140 | }
|
---|
141 |
|
---|
142 | return 0 unless open (MAPFILE, $mapfile);
|
---|
143 |
|
---|
144 | my ($line, @line);
|
---|
145 | $translations{$to} = [@array256];
|
---|
146 | $translations{$from} = [@array256];
|
---|
147 | while (defined ($line = <MAPFILE>)) {
|
---|
148 | chomp $line;
|
---|
149 | # remove comments
|
---|
150 | $line =~ s/\#.*$//;
|
---|
151 | next unless $line =~ /\S/;
|
---|
152 |
|
---|
153 | # split the line into fields and do a few
|
---|
154 | # simple sanity checks
|
---|
155 | @line = split (/\t/, $line);
|
---|
156 | next unless (scalar(@line) >= 2 &&
|
---|
157 | $line[0] =~ /^0x/ &&
|
---|
158 | $line[1] =~ /^0x/);
|
---|
159 |
|
---|
160 | my $char = hex($line[0]);
|
---|
161 | my $unic = hex($line[1]);
|
---|
162 |
|
---|
163 | # might need this for some versions of gb but not gbk
|
---|
164 | # $char = $char | 0x8080 unless ($encoding =~ /gbk/i);
|
---|
165 |
|
---|
166 | &addchartrans ($translations{$to}, $char, $unic);
|
---|
167 | &addchartrans ($translations{$from}, $unic, $char);
|
---|
168 | }
|
---|
169 |
|
---|
170 | close (MAPFILE);
|
---|
171 |
|
---|
172 | return 1;
|
---|
173 | }
|
---|
174 |
|
---|
175 | # addchartrans adds one character translation to a translation block.
|
---|
176 | # It also simplifies the translation block if possible.
|
---|
177 | sub addchartrans {
|
---|
178 | my ($block, $from, $to) = @_;
|
---|
179 | my $i = 0;
|
---|
180 |
|
---|
181 | my $high = ($from / 256) % 256;
|
---|
182 | my $low = $from % 256;
|
---|
183 |
|
---|
184 | if (ref ($block->[$high]) ne "ARRAY") {
|
---|
185 | $block->[$high] = [@array256];
|
---|
186 | }
|
---|
187 | $block->[$high]->[$low] = $to;
|
---|
188 | }
|
---|