1 | @rem = '--*-Perl-*--
|
---|
2 | @echo off
|
---|
3 | if "%OS%" == "Windows_NT" goto WinNT
|
---|
4 | perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
|
---|
5 | goto endofperl
|
---|
6 | :WinNT
|
---|
7 | perl -x -S %0 %*
|
---|
8 | if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
|
---|
9 | if %errorlevel% == 9009 echo You do not have Perl in your PATH.
|
---|
10 | if errorlevel 1 goto script_failed_so_exit_with_non_zero_val 2>nul
|
---|
11 | goto endofperl
|
---|
12 | @rem ';
|
---|
13 | #!perl
|
---|
14 | #line 15
|
---|
15 | eval 'exec c:\shaoqunWu\perl\bin\perl.exe -S $0 ${1+"$@"}'
|
---|
16 | if $running_under_some_shell;
|
---|
17 | #!./perl
|
---|
18 | BEGIN {
|
---|
19 | # @INC poking no longer needed w/ new MakeMaker and Makefile.PL's
|
---|
20 | # with $ENV{PERL_CORE} set
|
---|
21 | # In case we need it in future...
|
---|
22 | require Config; import Config;
|
---|
23 | }
|
---|
24 | use strict;
|
---|
25 | use warnings;
|
---|
26 | use Getopt::Std;
|
---|
27 | my @orig_ARGV = @ARGV;
|
---|
28 | our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
29 |
|
---|
30 | # These may get re-ordered.
|
---|
31 | # RAW is a do_now as inserted by &enter
|
---|
32 | # AGG is an aggreagated do_now, as built up by &process
|
---|
33 |
|
---|
34 | use constant {
|
---|
35 | RAW_NEXT => 0,
|
---|
36 | RAW_IN_LEN => 1,
|
---|
37 | RAW_OUT_BYTES => 2,
|
---|
38 | RAW_FALLBACK => 3,
|
---|
39 |
|
---|
40 | AGG_MIN_IN => 0,
|
---|
41 | AGG_MAX_IN => 1,
|
---|
42 | AGG_OUT_BYTES => 2,
|
---|
43 | AGG_NEXT => 3,
|
---|
44 | AGG_IN_LEN => 4,
|
---|
45 | AGG_OUT_LEN => 5,
|
---|
46 | AGG_FALLBACK => 6,
|
---|
47 | };
|
---|
48 |
|
---|
49 | # (See the algorithm in encengine.c - we're building structures for it)
|
---|
50 |
|
---|
51 | # There are two sorts of structures.
|
---|
52 | # "do_now" (an array, two variants of what needs storing) is whatever we need
|
---|
53 | # to do now we've read an input byte.
|
---|
54 | # It's housed in a "do_next" (which is how we got to it), and in turn points
|
---|
55 | # to a "do_next" which contains all the "do_now"s for the next input byte.
|
---|
56 |
|
---|
57 | # There will be a "do_next" which is the start state.
|
---|
58 | # For a single byte encoding it's the only "do_next" - each "do_now" points
|
---|
59 | # back to it, and each "do_now" will cause bytes. There is no state.
|
---|
60 |
|
---|
61 | # For a multi-byte encoding where all characters in the input are the same
|
---|
62 | # length, then there will be a tree of "do_now"->"do_next"->"do_now"
|
---|
63 | # branching out from the start state, one step for each input byte.
|
---|
64 | # The leaf "do_now"s will all be at the same distance from the start state,
|
---|
65 | # only the leaf "do_now"s cause output bytes, and they in turn point back to
|
---|
66 | # the start state.
|
---|
67 |
|
---|
68 | # For an encoding where there are varaible length input byte sequences, you
|
---|
69 | # will encounter a leaf "do_now" sooner for the shorter input sequences, but
|
---|
70 | # as before the leaves will point back to the start state.
|
---|
71 |
|
---|
72 | # The system will cope with escape encodings (imagine them as a mostly
|
---|
73 | # self-contained tree for each escape state, and cross links between trees
|
---|
74 | # at the state-switching characters) but so far no input format defines these.
|
---|
75 |
|
---|
76 | # The system will also cope with having output "leaves" in the middle of
|
---|
77 | # the bifurcating branches, not just at the extremities, but again no
|
---|
78 | # input format does this yet.
|
---|
79 |
|
---|
80 | # There are two variants of the "do_now" structure. The first, smaller variant
|
---|
81 | # is generated by &enter as the input file is read. There is one structure
|
---|
82 | # for each input byte. Say we are mapping a single byte encoding to a
|
---|
83 | # single byte encoding, with "ABCD" going "abcd". There will be
|
---|
84 | # 4 "do_now"s, {"A" => [...,"a",...], "B" => [...,"b",...], "C"=>..., "D"=>...}
|
---|
85 |
|
---|
86 | # &process then walks the tree, building aggregate "do_now" structres for
|
---|
87 | # adjacent bytes where possible. The aggregate is for a contiguous range of
|
---|
88 | # bytes which each produce the same length of output, each move to the
|
---|
89 | # same next state, and each have the same fallback flag.
|
---|
90 | # So our 4 RAW "do_now"s above become replaced by a single structure
|
---|
91 | # containing:
|
---|
92 | # ["A", "D", "abcd", 1, ...]
|
---|
93 | # ie, for an input byte $_ in "A".."D", output 1 byte, found as
|
---|
94 | # substr ("abcd", (ord $_ - ord "A") * 1, 1)
|
---|
95 | # which maps very nicely into pointer arithmetic in C for encengine.c
|
---|
96 |
|
---|
97 | sub encode_U
|
---|
98 | {
|
---|
99 | # UTF-8 encode long hand - only covers part of perl's range
|
---|
100 | ## my $uv = shift;
|
---|
101 | # chr() works in native space so convert value from table
|
---|
102 | # into that space before using chr().
|
---|
103 | my $ch = chr(utf8::unicode_to_native($_[0]));
|
---|
104 | # Now get core perl to encode that the way it likes.
|
---|
105 | utf8::encode($ch);
|
---|
106 | return $ch;
|
---|
107 | }
|
---|
108 |
|
---|
109 | sub encode_S
|
---|
110 | {
|
---|
111 | # encode single byte
|
---|
112 | ## my ($ch,$page) = @_; return chr($ch);
|
---|
113 | return chr $_[0];
|
---|
114 | }
|
---|
115 |
|
---|
116 | sub encode_D
|
---|
117 | {
|
---|
118 | # encode double byte MS byte first
|
---|
119 | ## my ($ch,$page) = @_; return chr($page).chr($ch);
|
---|
120 | return chr ($_[1]) . chr $_[0];
|
---|
121 | }
|
---|
122 |
|
---|
123 | sub encode_M
|
---|
124 | {
|
---|
125 | # encode Multi-byte - single for 0..255 otherwise double
|
---|
126 | ## my ($ch,$page) = @_;
|
---|
127 | ## return &encode_D if $page;
|
---|
128 | ## return &encode_S;
|
---|
129 | return chr ($_[1]) . chr $_[0] if $_[1];
|
---|
130 | return chr $_[0];
|
---|
131 | }
|
---|
132 |
|
---|
133 | my %encode_types = (U => \&encode_U,
|
---|
134 | S => \&encode_S,
|
---|
135 | D => \&encode_D,
|
---|
136 | M => \&encode_M,
|
---|
137 | );
|
---|
138 |
|
---|
139 | # Win32 does not expand globs on command line
|
---|
140 | eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32');
|
---|
141 |
|
---|
142 | my %opt;
|
---|
143 | # I think these are:
|
---|
144 | # -Q to disable the duplicate codepoint test
|
---|
145 | # -S make mapping errors fatal
|
---|
146 | # -q to remove comments written to output files
|
---|
147 | # -O to enable the (brute force) substring optimiser
|
---|
148 | # -o <output> to specify the output file name (else it's the first arg)
|
---|
149 | # -f <inlist> to give a file with a list of input files (else use the args)
|
---|
150 | # -n <name> to name the encoding (else use the basename of the input file.
|
---|
151 | getopts('CM:SQqOo:f:n:',\%opt);
|
---|
152 |
|
---|
153 | $opt{M} and make_makefile_pl($opt{M}, @ARGV);
|
---|
154 | $opt{C} and make_configlocal_pm($opt{C}, @ARGV);
|
---|
155 |
|
---|
156 | # This really should go first, else the die here causes empty (non-erroneous)
|
---|
157 | # output files to be written.
|
---|
158 | my @encfiles;
|
---|
159 | if (exists $opt{'f'}) {
|
---|
160 | # -F is followed by name of file containing list of filenames
|
---|
161 | my $flist = $opt{'f'};
|
---|
162 | open(FLIST,$flist) || die "Cannot open $flist:$!";
|
---|
163 | chomp(@encfiles = <FLIST>);
|
---|
164 | close(FLIST);
|
---|
165 | } else {
|
---|
166 | @encfiles = @ARGV;
|
---|
167 | }
|
---|
168 |
|
---|
169 | my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV);
|
---|
170 | chmod(0666,$cname) if -f $cname && !-w $cname;
|
---|
171 | open(C,">$cname") || die "Cannot open $cname:$!";
|
---|
172 |
|
---|
173 | my $dname = $cname;
|
---|
174 | my $hname = $cname;
|
---|
175 |
|
---|
176 | my ($doC,$doEnc,$doUcm,$doPet);
|
---|
177 |
|
---|
178 | if ($cname =~ /\.(c|xs)$/i) # VMS may have upcased filenames with DECC$ARGV_PARSE_STYLE defined
|
---|
179 | {
|
---|
180 | $doC = 1;
|
---|
181 | $dname =~ s/(\.[^\.]*)?$/.exh/;
|
---|
182 | chmod(0666,$dname) if -f $cname && !-w $dname;
|
---|
183 | open(D,">$dname") || die "Cannot open $dname:$!";
|
---|
184 | $hname =~ s/(\.[^\.]*)?$/.h/;
|
---|
185 | chmod(0666,$hname) if -f $cname && !-w $hname;
|
---|
186 | open(H,">$hname") || die "Cannot open $hname:$!";
|
---|
187 |
|
---|
188 | foreach my $fh (\*C,\*D,\*H)
|
---|
189 | {
|
---|
190 | print $fh <<"END" unless $opt{'q'};
|
---|
191 | /*
|
---|
192 | !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
|
---|
193 | This file was autogenerated by:
|
---|
194 | $^X $0 @orig_ARGV
|
---|
195 | */
|
---|
196 | END
|
---|
197 | }
|
---|
198 |
|
---|
199 | if ($cname =~ /(\w+)\.xs$/)
|
---|
200 | {
|
---|
201 | print C "#include <EXTERN.h>\n";
|
---|
202 | print C "#include <perl.h>\n";
|
---|
203 | print C "#include <XSUB.h>\n";
|
---|
204 | print C "#define U8 U8\n";
|
---|
205 | }
|
---|
206 | print C "#include \"encode.h\"\n\n";
|
---|
207 |
|
---|
208 | }
|
---|
209 | elsif ($cname =~ /\.enc$/)
|
---|
210 | {
|
---|
211 | $doEnc = 1;
|
---|
212 | }
|
---|
213 | elsif ($cname =~ /\.ucm$/)
|
---|
214 | {
|
---|
215 | $doUcm = 1;
|
---|
216 | }
|
---|
217 | elsif ($cname =~ /\.pet$/)
|
---|
218 | {
|
---|
219 | $doPet = 1;
|
---|
220 | }
|
---|
221 |
|
---|
222 | my %encoding;
|
---|
223 | my %strings;
|
---|
224 | my $string_acc;
|
---|
225 | my %strings_in_acc;
|
---|
226 |
|
---|
227 | my $saved = 0;
|
---|
228 | my $subsave = 0;
|
---|
229 | my $strings = 0;
|
---|
230 |
|
---|
231 | sub cmp_name
|
---|
232 | {
|
---|
233 | if ($a =~ /^.*-(\d+)/)
|
---|
234 | {
|
---|
235 | my $an = $1;
|
---|
236 | if ($b =~ /^.*-(\d+)/)
|
---|
237 | {
|
---|
238 | my $r = $an <=> $1;
|
---|
239 | return $r if $r;
|
---|
240 | }
|
---|
241 | }
|
---|
242 | return $a cmp $b;
|
---|
243 | }
|
---|
244 |
|
---|
245 |
|
---|
246 | foreach my $enc (sort cmp_name @encfiles)
|
---|
247 | {
|
---|
248 | my ($name,$sfx) = $enc =~ /^.*?([\w-]+)\.(enc|ucm)$/;
|
---|
249 | $name = $opt{'n'} if exists $opt{'n'};
|
---|
250 | if (open(E,$enc))
|
---|
251 | {
|
---|
252 | if ($sfx eq 'enc')
|
---|
253 | {
|
---|
254 | compile_enc(\*E,lc($name));
|
---|
255 | }
|
---|
256 | else
|
---|
257 | {
|
---|
258 | compile_ucm(\*E,lc($name));
|
---|
259 | }
|
---|
260 | }
|
---|
261 | else
|
---|
262 | {
|
---|
263 | warn "Cannot open $enc for $name:$!";
|
---|
264 | }
|
---|
265 | }
|
---|
266 |
|
---|
267 | if ($doC)
|
---|
268 | {
|
---|
269 | print STDERR "Writing compiled form\n";
|
---|
270 | foreach my $name (sort cmp_name keys %encoding)
|
---|
271 | {
|
---|
272 | my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
|
---|
273 | process($name.'_utf8',$e2u);
|
---|
274 | addstrings(\*C,$e2u);
|
---|
275 |
|
---|
276 | process('utf8_'.$name,$u2e);
|
---|
277 | addstrings(\*C,$u2e);
|
---|
278 | }
|
---|
279 | outbigstring(\*C,"enctable");
|
---|
280 | foreach my $name (sort cmp_name keys %encoding)
|
---|
281 | {
|
---|
282 | my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
|
---|
283 | outtable(\*C,$e2u, "enctable");
|
---|
284 | outtable(\*C,$u2e, "enctable");
|
---|
285 |
|
---|
286 | # push(@{$encoding{$name}},outstring(\*C,$e2u->{Cname}.'_def',$erep));
|
---|
287 | }
|
---|
288 | foreach my $enc (sort cmp_name keys %encoding)
|
---|
289 | {
|
---|
290 | # my ($e2u,$u2e,$rep,$min_el,$max_el,$rsym) = @{$encoding{$enc}};
|
---|
291 | my ($e2u,$u2e,$rep,$min_el,$max_el) = @{$encoding{$enc}};
|
---|
292 | #my @info = ($e2u->{Cname},$u2e->{Cname},$rsym,length($rep),$min_el,$max_el);
|
---|
293 | my $replen = 0;
|
---|
294 | $replen++ while($rep =~ /\G\\x[0-9A-Fa-f]/g);
|
---|
295 | my @info = ($e2u->{Cname},$u2e->{Cname},qq((U8 *)"$rep"),$replen,$min_el,$max_el);
|
---|
296 | my $sym = "${enc}_encoding";
|
---|
297 | $sym =~ s/\W+/_/g;
|
---|
298 | print C "encode_t $sym = \n";
|
---|
299 | # This is to make null encoding work -- dankogai
|
---|
300 | for (my $i = (scalar @info) - 1; $i >= 0; --$i){
|
---|
301 | $info[$i] ||= 1;
|
---|
302 | }
|
---|
303 | # end of null tweak -- dankogai
|
---|
304 | print C " {",join(',',@info,"{\"$enc\",(const char *)0}"),"};\n\n";
|
---|
305 | }
|
---|
306 |
|
---|
307 | foreach my $enc (sort cmp_name keys %encoding)
|
---|
308 | {
|
---|
309 | my $sym = "${enc}_encoding";
|
---|
310 | $sym =~ s/\W+/_/g;
|
---|
311 | print H "extern encode_t $sym;\n";
|
---|
312 | print D " Encode_XSEncoding(aTHX_ &$sym);\n";
|
---|
313 | }
|
---|
314 |
|
---|
315 | if ($cname =~ /(\w+)\.xs$/)
|
---|
316 | {
|
---|
317 | my $mod = $1;
|
---|
318 | print C <<'END';
|
---|
319 |
|
---|
320 | static void
|
---|
321 | Encode_XSEncoding(pTHX_ encode_t *enc)
|
---|
322 | {
|
---|
323 | dSP;
|
---|
324 | HV *stash = gv_stashpv("Encode::XS", TRUE);
|
---|
325 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
|
---|
326 | int i = 0;
|
---|
327 | PUSHMARK(sp);
|
---|
328 | XPUSHs(sv);
|
---|
329 | while (enc->name[i])
|
---|
330 | {
|
---|
331 | const char *name = enc->name[i++];
|
---|
332 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
|
---|
333 | }
|
---|
334 | PUTBACK;
|
---|
335 | call_pv("Encode::define_encoding",G_DISCARD);
|
---|
336 | SvREFCNT_dec(sv);
|
---|
337 | }
|
---|
338 |
|
---|
339 | END
|
---|
340 |
|
---|
341 | print C "\nMODULE = Encode::$mod\tPACKAGE = Encode::$mod\n\n";
|
---|
342 | print C "BOOT:\n{\n";
|
---|
343 | print C "#include \"$dname\"\n";
|
---|
344 | print C "}\n";
|
---|
345 | }
|
---|
346 | # Close in void context is bad, m'kay
|
---|
347 | close(D) or warn "Error closing '$dname': $!";
|
---|
348 | close(H) or warn "Error closing '$hname': $!";
|
---|
349 |
|
---|
350 | my $perc_saved = $saved/($strings + $saved) * 100;
|
---|
351 | my $perc_subsaved = $subsave/($strings + $subsave) * 100;
|
---|
352 | printf STDERR "%d bytes in string tables\n",$strings;
|
---|
353 | printf STDERR "%d bytes (%.3g%%) saved spotting duplicates\n",
|
---|
354 | $saved, $perc_saved if $saved;
|
---|
355 | printf STDERR "%d bytes (%.3g%%) saved using substrings\n",
|
---|
356 | $subsave, $perc_subsaved if $subsave;
|
---|
357 | }
|
---|
358 | elsif ($doEnc)
|
---|
359 | {
|
---|
360 | foreach my $name (sort cmp_name keys %encoding)
|
---|
361 | {
|
---|
362 | my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
|
---|
363 | output_enc(\*C,$name,$e2u);
|
---|
364 | }
|
---|
365 | }
|
---|
366 | elsif ($doUcm)
|
---|
367 | {
|
---|
368 | foreach my $name (sort cmp_name keys %encoding)
|
---|
369 | {
|
---|
370 | my ($e2u,$u2e,$erep,$min_el,$max_el) = @{$encoding{$name}};
|
---|
371 | output_ucm(\*C,$name,$u2e,$erep,$min_el,$max_el);
|
---|
372 | }
|
---|
373 | }
|
---|
374 |
|
---|
375 | # writing half meg files and then not checking to see if you just filled the
|
---|
376 | # disk is bad, m'kay
|
---|
377 | close(C) or die "Error closing '$cname': $!";
|
---|
378 |
|
---|
379 | # End of the main program.
|
---|
380 |
|
---|
381 | sub compile_ucm
|
---|
382 | {
|
---|
383 | my ($fh,$name) = @_;
|
---|
384 | my $e2u = {};
|
---|
385 | my $u2e = {};
|
---|
386 | my $cs;
|
---|
387 | my %attr;
|
---|
388 | while (<$fh>)
|
---|
389 | {
|
---|
390 | s/#.*$//;
|
---|
391 | last if /^\s*CHARMAP\s*$/i;
|
---|
392 | if (/^\s*<(\w+)>\s+"?([^"]*)"?\s*$/i) # " # Grrr
|
---|
393 | {
|
---|
394 | $attr{$1} = $2;
|
---|
395 | }
|
---|
396 | }
|
---|
397 | if (!defined($cs = $attr{'code_set_name'}))
|
---|
398 | {
|
---|
399 | warn "No <code_set_name> in $name\n";
|
---|
400 | }
|
---|
401 | else
|
---|
402 | {
|
---|
403 | $name = $cs unless exists $opt{'n'};
|
---|
404 | }
|
---|
405 | my $erep;
|
---|
406 | my $urep;
|
---|
407 | my $max_el;
|
---|
408 | my $min_el;
|
---|
409 | if (exists $attr{'subchar'})
|
---|
410 | {
|
---|
411 | #my @byte;
|
---|
412 | #$attr{'subchar'} =~ /^\s*/cg;
|
---|
413 | #push(@byte,$1) while $attr{'subchar'} =~ /\G\\x([0-9a-f]+)/icg;
|
---|
414 | #$erep = join('',map(chr(hex($_)),@byte));
|
---|
415 | $erep = $attr{'subchar'};
|
---|
416 | $erep =~ s/^\s+//; $erep =~ s/\s+$//;
|
---|
417 | }
|
---|
418 | print "Reading $name ($cs)\n";
|
---|
419 | my $nfb = 0;
|
---|
420 | my $hfb = 0;
|
---|
421 | while (<$fh>)
|
---|
422 | {
|
---|
423 | s/#.*$//;
|
---|
424 | last if /^\s*END\s+CHARMAP\s*$/i;
|
---|
425 | next if /^\s*$/;
|
---|
426 | my (@uni, @byte) = ();
|
---|
427 | my ($uni, $byte, $fb) = m/^(\S+)\s+(\S+)\s+(\S+)\s+/o
|
---|
428 | or die "Bad line: $_";
|
---|
429 | while ($uni =~ m/\G<([U0-9a-fA-F\+]+)>/g){
|
---|
430 | push @uni, map { substr($_, 1) } split(/\+/, $1);
|
---|
431 | }
|
---|
432 | while ($byte =~ m/\G\\x([0-9a-fA-F]+)/g){
|
---|
433 | push @byte, $1;
|
---|
434 | }
|
---|
435 | if (@uni)
|
---|
436 | {
|
---|
437 | my $uch = join('', map { encode_U(hex($_)) } @uni );
|
---|
438 | my $ech = join('',map(chr(hex($_)),@byte));
|
---|
439 | my $el = length($ech);
|
---|
440 | $max_el = $el if (!defined($max_el) || $el > $max_el);
|
---|
441 | $min_el = $el if (!defined($min_el) || $el < $min_el);
|
---|
442 | if (length($fb))
|
---|
443 | {
|
---|
444 | $fb = substr($fb,1);
|
---|
445 | $hfb++;
|
---|
446 | }
|
---|
447 | else
|
---|
448 | {
|
---|
449 | $nfb++;
|
---|
450 | $fb = '0';
|
---|
451 | }
|
---|
452 | # $fb is fallback flag
|
---|
453 | # 0 - round trip safe
|
---|
454 | # 1 - fallback for unicode -> enc
|
---|
455 | # 2 - skip sub-char mapping
|
---|
456 | # 3 - fallback enc -> unicode
|
---|
457 | enter($u2e,$uch,$ech,$u2e,$fb+0) if ($fb =~ /[01]/);
|
---|
458 | enter($e2u,$ech,$uch,$e2u,$fb+0) if ($fb =~ /[03]/);
|
---|
459 | }
|
---|
460 | else
|
---|
461 | {
|
---|
462 | warn $_;
|
---|
463 | }
|
---|
464 | }
|
---|
465 | if ($nfb && $hfb)
|
---|
466 | {
|
---|
467 | die "$nfb entries without fallback, $hfb entries with\n";
|
---|
468 | }
|
---|
469 | $encoding{$name} = [$e2u,$u2e,$erep,$min_el,$max_el];
|
---|
470 | }
|
---|
471 |
|
---|
472 |
|
---|
473 |
|
---|
474 | sub compile_enc
|
---|
475 | {
|
---|
476 | my ($fh,$name) = @_;
|
---|
477 | my $e2u = {};
|
---|
478 | my $u2e = {};
|
---|
479 |
|
---|
480 | my $type;
|
---|
481 | while ($type = <$fh>)
|
---|
482 | {
|
---|
483 | last if $type !~ /^\s*#/;
|
---|
484 | }
|
---|
485 | chomp($type);
|
---|
486 | return if $type eq 'E';
|
---|
487 | # Do the hash lookup once, rather than once per function call. 4% speedup.
|
---|
488 | my $type_func = $encode_types{$type};
|
---|
489 | my ($def,$sym,$pages) = split(/\s+/,scalar(<$fh>));
|
---|
490 | warn "$type encoded $name\n";
|
---|
491 | my $rep = '';
|
---|
492 | # Save a defined test by setting these to defined values.
|
---|
493 | my $min_el = ~0; # A very big integer
|
---|
494 | my $max_el = 0; # Anything must be longer than 0
|
---|
495 | {
|
---|
496 | my $v = hex($def);
|
---|
497 | $rep = &$type_func($v & 0xFF, ($v >> 8) & 0xffe);
|
---|
498 | }
|
---|
499 | my $errors;
|
---|
500 | my $seen;
|
---|
501 | # use -Q to silence the seen test. Makefile.PL uses this by default.
|
---|
502 | $seen = {} unless $opt{Q};
|
---|
503 | do
|
---|
504 | {
|
---|
505 | my $line = <$fh>;
|
---|
506 | chomp($line);
|
---|
507 | my $page = hex($line);
|
---|
508 | my $ch = 0;
|
---|
509 | my $i = 16;
|
---|
510 | do
|
---|
511 | {
|
---|
512 | # So why is it 1% faster to leave the my here?
|
---|
513 | my $line = <$fh>;
|
---|
514 | $line =~ s/\r\n$/\n/;
|
---|
515 | die "$.:${line}Line should be exactly 65 characters long including
|
---|
516 | newline (".length($line).")" unless length ($line) == 65;
|
---|
517 | # Split line into groups of 4 hex digits, convert groups to ints
|
---|
518 | # This takes 65.35
|
---|
519 | # map {hex $_} $line =~ /(....)/g
|
---|
520 | # This takes 63.75 (2.5% less time)
|
---|
521 | # unpack "n*", pack "H*", $line
|
---|
522 | # There's an implicit loop in map. Loops are bad, m'kay. Ops are bad, m'kay
|
---|
523 | # Doing it as while ($line =~ /(....)/g) took 74.63
|
---|
524 | foreach my $val (unpack "n*", pack "H*", $line)
|
---|
525 | {
|
---|
526 | next if $val == 0xFFFD;
|
---|
527 | my $ech = &$type_func($ch,$page);
|
---|
528 | if ($val || (!$ch && !$page))
|
---|
529 | {
|
---|
530 | my $el = length($ech);
|
---|
531 | $max_el = $el if $el > $max_el;
|
---|
532 | $min_el = $el if $el < $min_el;
|
---|
533 | my $uch = encode_U($val);
|
---|
534 | if ($seen) {
|
---|
535 | # We're doing the test.
|
---|
536 | # We don't need to read this quickly, so storing it as a scalar,
|
---|
537 | # rather than 3 (anon array, plus the 2 scalars it holds) saves
|
---|
538 | # RAM and may make us faster on low RAM systems. [see __END__]
|
---|
539 | if (exists $seen->{$uch})
|
---|
540 | {
|
---|
541 | warn sprintf("U%04X is %02X%02X and %04X\n",
|
---|
542 | $val,$page,$ch,$seen->{$uch});
|
---|
543 | $errors++;
|
---|
544 | }
|
---|
545 | else
|
---|
546 | {
|
---|
547 | $seen->{$uch} = $page << 8 | $ch;
|
---|
548 | }
|
---|
549 | }
|
---|
550 | # Passing 2 extra args each time is 3.6% slower!
|
---|
551 | # Even with having to add $fallback ||= 0 later
|
---|
552 | enter_fb0($e2u,$ech,$uch);
|
---|
553 | enter_fb0($u2e,$uch,$ech);
|
---|
554 | }
|
---|
555 | else
|
---|
556 | {
|
---|
557 | # No character at this position
|
---|
558 | # enter($e2u,$ech,undef,$e2u);
|
---|
559 | }
|
---|
560 | $ch++;
|
---|
561 | }
|
---|
562 | } while --$i;
|
---|
563 | } while --$pages;
|
---|
564 | die "\$min_el=$min_el, \$max_el=$max_el - seems we read no lines"
|
---|
565 | if $min_el > $max_el;
|
---|
566 | die "$errors mapping conflicts\n" if ($errors && $opt{'S'});
|
---|
567 | $encoding{$name} = [$e2u,$u2e,$rep,$min_el,$max_el];
|
---|
568 | }
|
---|
569 |
|
---|
570 | # my ($a,$s,$d,$t,$fb) = @_;
|
---|
571 | sub enter {
|
---|
572 | my ($current,$inbytes,$outbytes,$next,$fallback) = @_;
|
---|
573 | # state we shift to after this (multibyte) input character defaults to same
|
---|
574 | # as current state.
|
---|
575 | $next ||= $current;
|
---|
576 | # Making sure it is defined seems to be faster than {no warnings;} in
|
---|
577 | # &process, or passing it in as 0 explicity.
|
---|
578 | # XXX $fallback ||= 0;
|
---|
579 |
|
---|
580 | # Start at the beginning and work forwards through the string to zero.
|
---|
581 | # effectively we are removing 1 character from the front each time
|
---|
582 | # but we don't actually edit the string. [this alone seems to be 14% speedup]
|
---|
583 | # Hence -$pos is the length of the remaining string.
|
---|
584 | my $pos = -length $inbytes;
|
---|
585 | while (1) {
|
---|
586 | my $byte = substr $inbytes, $pos, 1;
|
---|
587 | # RAW_NEXT => 0,
|
---|
588 | # RAW_IN_LEN => 1,
|
---|
589 | # RAW_OUT_BYTES => 2,
|
---|
590 | # RAW_FALLBACK => 3,
|
---|
591 | # to unicode an array would seem to be better, because the pages are dense.
|
---|
592 | # from unicode can be very sparse, favouring a hash.
|
---|
593 | # hash using the bytes (all length 1) as keys rather than ord value,
|
---|
594 | # as it's easier to sort these in &process.
|
---|
595 |
|
---|
596 | # It's faster to always add $fallback even if it's undef, rather than
|
---|
597 | # choosing between 3 and 4 element array. (hence why we set it defined
|
---|
598 | # above)
|
---|
599 | my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,'',$fallback];
|
---|
600 | # When $pos was -1 we were at the last input character.
|
---|
601 | unless (++$pos) {
|
---|
602 | $do_now->[RAW_OUT_BYTES] = $outbytes;
|
---|
603 | $do_now->[RAW_NEXT] = $next;
|
---|
604 | return;
|
---|
605 | }
|
---|
606 | # Tail recursion. The intermdiate state may not have a name yet.
|
---|
607 | $current = $do_now->[RAW_NEXT];
|
---|
608 | }
|
---|
609 | }
|
---|
610 |
|
---|
611 | # This is purely for optimistation. It's just &enter hard coded for $fallback
|
---|
612 | # of 0, using only a 3 entry array ref to save memory for every entry.
|
---|
613 | sub enter_fb0 {
|
---|
614 | my ($current,$inbytes,$outbytes,$next) = @_;
|
---|
615 | $next ||= $current;
|
---|
616 |
|
---|
617 | my $pos = -length $inbytes;
|
---|
618 | while (1) {
|
---|
619 | my $byte = substr $inbytes, $pos, 1;
|
---|
620 | my $do_now = $current->{Raw}{$byte} ||= [{},-$pos,''];
|
---|
621 | unless (++$pos) {
|
---|
622 | $do_now->[RAW_OUT_BYTES] = $outbytes;
|
---|
623 | $do_now->[RAW_NEXT] = $next;
|
---|
624 | return;
|
---|
625 | }
|
---|
626 | $current = $do_now->[RAW_NEXT];
|
---|
627 | }
|
---|
628 | }
|
---|
629 |
|
---|
630 | sub process
|
---|
631 | {
|
---|
632 | my ($name,$a) = @_;
|
---|
633 | $name =~ s/\W+/_/g;
|
---|
634 | $a->{Cname} = $name;
|
---|
635 | my $raw = $a->{Raw};
|
---|
636 | my ($l, $agg_max_in, $agg_next, $agg_in_len, $agg_out_len, $agg_fallback);
|
---|
637 | my @ent;
|
---|
638 | $agg_max_in = 0;
|
---|
639 | foreach my $key (sort keys %$raw) {
|
---|
640 | # RAW_NEXT => 0,
|
---|
641 | # RAW_IN_LEN => 1,
|
---|
642 | # RAW_OUT_BYTES => 2,
|
---|
643 | # RAW_FALLBACK => 3,
|
---|
644 | my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
|
---|
645 | # Now we are converting from raw to aggregate, switch from 1 byte strings
|
---|
646 | # to numbers
|
---|
647 | my $b = ord $key;
|
---|
648 | $fallback ||= 0;
|
---|
649 | if ($l &&
|
---|
650 | # If this == fails, we're going to reset $agg_max_in below anyway.
|
---|
651 | $b == ++$agg_max_in &&
|
---|
652 | # References in numeric context give the pointer as an int.
|
---|
653 | $agg_next == $next &&
|
---|
654 | $agg_in_len == $in_len &&
|
---|
655 | $agg_out_len == length $out_bytes &&
|
---|
656 | $agg_fallback == $fallback
|
---|
657 | # && length($l->[AGG_OUT_BYTES]) < 16
|
---|
658 | ) {
|
---|
659 | # my $i = ord($b)-ord($l->[AGG_MIN_IN]);
|
---|
660 | # we can aggregate this byte onto the end.
|
---|
661 | $l->[AGG_MAX_IN] = $b;
|
---|
662 | $l->[AGG_OUT_BYTES] .= $out_bytes;
|
---|
663 | } else {
|
---|
664 | # AGG_MIN_IN => 0,
|
---|
665 | # AGG_MAX_IN => 1,
|
---|
666 | # AGG_OUT_BYTES => 2,
|
---|
667 | # AGG_NEXT => 3,
|
---|
668 | # AGG_IN_LEN => 4,
|
---|
669 | # AGG_OUT_LEN => 5,
|
---|
670 | # AGG_FALLBACK => 6,
|
---|
671 | # Reset the last thing we saw, plus set 5 lexicals to save some derefs.
|
---|
672 | # (only gains .6% on euc-jp -- is it worth it?)
|
---|
673 | push @ent, $l = [$b, $agg_max_in = $b, $out_bytes, $agg_next = $next,
|
---|
674 | $agg_in_len = $in_len, $agg_out_len = length $out_bytes,
|
---|
675 | $agg_fallback = $fallback];
|
---|
676 | }
|
---|
677 | if (exists $next->{Cname}) {
|
---|
678 | $next->{'Forward'} = 1 if $next != $a;
|
---|
679 | } else {
|
---|
680 | process(sprintf("%s_%02x",$name,$b),$next);
|
---|
681 | }
|
---|
682 | }
|
---|
683 | # encengine.c rules say that last entry must be for 255
|
---|
684 | if ($agg_max_in < 255) {
|
---|
685 | push @ent, [1+$agg_max_in, 255,undef,$a,0,0];
|
---|
686 | }
|
---|
687 | $a->{'Entries'} = \@ent;
|
---|
688 | }
|
---|
689 |
|
---|
690 |
|
---|
691 | sub addstrings
|
---|
692 | {
|
---|
693 | my ($fh,$a) = @_;
|
---|
694 | my $name = $a->{'Cname'};
|
---|
695 | # String tables
|
---|
696 | foreach my $b (@{$a->{'Entries'}})
|
---|
697 | {
|
---|
698 | next unless $b->[AGG_OUT_LEN];
|
---|
699 | $strings{$b->[AGG_OUT_BYTES]} = undef;
|
---|
700 | }
|
---|
701 | if ($a->{'Forward'})
|
---|
702 | {
|
---|
703 | my $var = $^O eq 'MacOS' ? 'extern' : 'static';
|
---|
704 | print $fh "$var encpage_t $name\[",scalar(@{$a->{'Entries'}}),"];\n";
|
---|
705 | }
|
---|
706 | $a->{'DoneStrings'} = 1;
|
---|
707 | foreach my $b (@{$a->{'Entries'}})
|
---|
708 | {
|
---|
709 | my ($s,$e,$out,$t,$end,$l) = @$b;
|
---|
710 | addstrings($fh,$t) unless $t->{'DoneStrings'};
|
---|
711 | }
|
---|
712 | }
|
---|
713 |
|
---|
714 | sub outbigstring
|
---|
715 | {
|
---|
716 | my ($fh,$name) = @_;
|
---|
717 |
|
---|
718 | $string_acc = '';
|
---|
719 |
|
---|
720 | # Make the big string in the string accumulator. Longest first, on the hope
|
---|
721 | # that this makes it more likely that we find the short strings later on.
|
---|
722 | # Not sure if it helps sorting strings of the same length lexcically.
|
---|
723 | foreach my $s (sort {length $b <=> length $a || $a cmp $b} keys %strings) {
|
---|
724 | my $index = index $string_acc, $s;
|
---|
725 | if ($index >= 0) {
|
---|
726 | $saved += length($s);
|
---|
727 | $strings_in_acc{$s} = $index;
|
---|
728 | } else {
|
---|
729 | OPTIMISER: {
|
---|
730 | if ($opt{'O'}) {
|
---|
731 | my $sublength = length $s;
|
---|
732 | while (--$sublength > 0) {
|
---|
733 | # progressively lop characters off the end, to see if the start of
|
---|
734 | # the new string overlaps the end of the accumulator.
|
---|
735 | if (substr ($string_acc, -$sublength)
|
---|
736 | eq substr ($s, 0, $sublength)) {
|
---|
737 | $subsave += $sublength;
|
---|
738 | $strings_in_acc{$s} = length ($string_acc) - $sublength;
|
---|
739 | # append the last bit on the end.
|
---|
740 | $string_acc .= substr ($s, $sublength);
|
---|
741 | last OPTIMISER;
|
---|
742 | }
|
---|
743 | # or if the end of the new string overlaps the start of the
|
---|
744 | # accumulator
|
---|
745 | next unless substr ($string_acc, 0, $sublength)
|
---|
746 | eq substr ($s, -$sublength);
|
---|
747 | # well, the last $sublength characters of the accumulator match.
|
---|
748 | # so as we're prepending to the accumulator, need to shift all our
|
---|
749 | # existing offsets forwards
|
---|
750 | $_ += $sublength foreach values %strings_in_acc;
|
---|
751 | $subsave += $sublength;
|
---|
752 | $strings_in_acc{$s} = 0;
|
---|
753 | # append the first bit on the start.
|
---|
754 | $string_acc = substr ($s, 0, -$sublength) . $string_acc;
|
---|
755 | last OPTIMISER;
|
---|
756 | }
|
---|
757 | }
|
---|
758 | # Optimiser (if it ran) found nothing, so just going have to tack the
|
---|
759 | # whole thing on the end.
|
---|
760 | $strings_in_acc{$s} = length $string_acc;
|
---|
761 | $string_acc .= $s;
|
---|
762 | };
|
---|
763 | }
|
---|
764 | }
|
---|
765 |
|
---|
766 | $strings = length $string_acc;
|
---|
767 | my $definition = "\nstatic const U8 $name\[$strings] = { " .
|
---|
768 | join(',',unpack "C*",$string_acc);
|
---|
769 | # We have a single long line. Split it at convenient commas.
|
---|
770 | print $fh $1, "\n" while $definition =~ /\G(.{74,77},)/gcs;
|
---|
771 | print $fh substr ($definition, pos $definition), " };\n";
|
---|
772 | }
|
---|
773 |
|
---|
774 | sub findstring {
|
---|
775 | my ($name,$s) = @_;
|
---|
776 | my $offset = $strings_in_acc{$s};
|
---|
777 | die "Can't find string " . join (',',unpack "C*",$s) . " in accumulator"
|
---|
778 | unless defined $offset;
|
---|
779 | "$name + $offset";
|
---|
780 | }
|
---|
781 |
|
---|
782 | sub outtable
|
---|
783 | {
|
---|
784 | my ($fh,$a,$bigname) = @_;
|
---|
785 | my $name = $a->{'Cname'};
|
---|
786 | $a->{'Done'} = 1;
|
---|
787 | foreach my $b (@{$a->{'Entries'}})
|
---|
788 | {
|
---|
789 | my ($s,$e,$out,$t,$end,$l) = @$b;
|
---|
790 | outtable($fh,$t,$bigname) unless $t->{'Done'};
|
---|
791 | }
|
---|
792 | print $fh "\nstatic encpage_t $name\[",scalar(@{$a->{'Entries'}}),"] = {\n";
|
---|
793 | foreach my $b (@{$a->{'Entries'}})
|
---|
794 | {
|
---|
795 | my ($sc,$ec,$out,$t,$end,$l,$fb) = @$b;
|
---|
796 | # $end |= 0x80 if $fb; # what the heck was on your mind, Nick? -- Dan
|
---|
797 | print $fh "{";
|
---|
798 | if ($l)
|
---|
799 | {
|
---|
800 | printf $fh findstring($bigname,$out);
|
---|
801 | }
|
---|
802 | else
|
---|
803 | {
|
---|
804 | print $fh "0";
|
---|
805 | }
|
---|
806 | print $fh ",",$t->{Cname};
|
---|
807 | printf $fh ",0x%02x,0x%02x,$l,$end},\n",$sc,$ec;
|
---|
808 | }
|
---|
809 | print $fh "};\n";
|
---|
810 | }
|
---|
811 |
|
---|
812 | sub output_enc
|
---|
813 | {
|
---|
814 | my ($fh,$name,$a) = @_;
|
---|
815 | die "Changed - fix me for new structure";
|
---|
816 | foreach my $b (sort keys %$a)
|
---|
817 | {
|
---|
818 | my ($s,$e,$out,$t,$end,$l,$fb) = @{$a->{$b}};
|
---|
819 | }
|
---|
820 | }
|
---|
821 |
|
---|
822 | sub decode_U
|
---|
823 | {
|
---|
824 | my $s = shift;
|
---|
825 | }
|
---|
826 |
|
---|
827 | my @uname;
|
---|
828 | sub char_names
|
---|
829 | {
|
---|
830 | my $s = do "unicore/Name.pl";
|
---|
831 | die "char_names: unicore/Name.pl: $!\n" unless defined $s;
|
---|
832 | pos($s) = 0;
|
---|
833 | while ($s =~ /\G([0-9a-f]+)\t([0-9a-f]*)\t(.*?)\s*\n/igc)
|
---|
834 | {
|
---|
835 | my $name = $3;
|
---|
836 | my $s = hex($1);
|
---|
837 | last if $s >= 0x10000;
|
---|
838 | my $e = length($2) ? hex($2) : $s;
|
---|
839 | for (my $i = $s; $i <= $e; $i++)
|
---|
840 | {
|
---|
841 | $uname[$i] = $name;
|
---|
842 | # print sprintf("U%04X $name\n",$i);
|
---|
843 | }
|
---|
844 | }
|
---|
845 | }
|
---|
846 |
|
---|
847 | sub output_ucm_page
|
---|
848 | {
|
---|
849 | my ($cmap,$a,$t,$pre) = @_;
|
---|
850 | # warn sprintf("Page %x\n",$pre);
|
---|
851 | my $raw = $t->{Raw};
|
---|
852 | foreach my $key (sort keys %$raw) {
|
---|
853 | # RAW_NEXT => 0,
|
---|
854 | # RAW_IN_LEN => 1,
|
---|
855 | # RAW_OUT_BYTES => 2,
|
---|
856 | # RAW_FALLBACK => 3,
|
---|
857 | my ($next, $in_len, $out_bytes, $fallback) = @{$raw->{$key}};
|
---|
858 | my $u = ord $key;
|
---|
859 | $fallback ||= 0;
|
---|
860 |
|
---|
861 | if ($next != $a && $next != $t) {
|
---|
862 | output_ucm_page($cmap,$a,$next,(($pre|($u &0x3F)) << 6)&0xFFFF);
|
---|
863 | } elsif (length $out_bytes) {
|
---|
864 | if ($pre) {
|
---|
865 | $u = $pre|($u &0x3f);
|
---|
866 | }
|
---|
867 | my $s = sprintf "<U%04X> ",$u;
|
---|
868 | #foreach my $c (split(//,$out_bytes)) {
|
---|
869 | # $s .= sprintf "\\x%02X",ord($c);
|
---|
870 | #}
|
---|
871 | # 9.5% faster changing that loop to this:
|
---|
872 | $s .= sprintf +("\\x%02X" x length $out_bytes), unpack "C*", $out_bytes;
|
---|
873 | $s .= sprintf " |%d # %s\n",($fallback ? 1 : 0),$uname[$u];
|
---|
874 | push(@$cmap,$s);
|
---|
875 | } else {
|
---|
876 | warn join(',',$u, @{$raw->{$key}},$a,$t);
|
---|
877 | }
|
---|
878 | }
|
---|
879 | }
|
---|
880 |
|
---|
881 | sub output_ucm
|
---|
882 | {
|
---|
883 | my ($fh,$name,$h,$rep,$min_el,$max_el) = @_;
|
---|
884 | print $fh "# $0 @orig_ARGV\n" unless $opt{'q'};
|
---|
885 | print $fh "<code_set_name> \"$name\"\n";
|
---|
886 | char_names();
|
---|
887 | if (defined $min_el)
|
---|
888 | {
|
---|
889 | print $fh "<mb_cur_min> $min_el\n";
|
---|
890 | }
|
---|
891 | if (defined $max_el)
|
---|
892 | {
|
---|
893 | print $fh "<mb_cur_max> $max_el\n";
|
---|
894 | }
|
---|
895 | if (defined $rep)
|
---|
896 | {
|
---|
897 | print $fh "<subchar> ";
|
---|
898 | foreach my $c (split(//,$rep))
|
---|
899 | {
|
---|
900 | printf $fh "\\x%02X",ord($c);
|
---|
901 | }
|
---|
902 | print $fh "\n";
|
---|
903 | }
|
---|
904 | my @cmap;
|
---|
905 | output_ucm_page(\@cmap,$h,$h,0);
|
---|
906 | print $fh "#\nCHARMAP\n";
|
---|
907 | foreach my $line (sort { substr($a,8) cmp substr($b,8) } @cmap)
|
---|
908 | {
|
---|
909 | print $fh $line;
|
---|
910 | }
|
---|
911 | print $fh "END CHARMAP\n";
|
---|
912 | }
|
---|
913 |
|
---|
914 | use vars qw(
|
---|
915 | $_Enc2xs
|
---|
916 | $_Version
|
---|
917 | $_Inc
|
---|
918 | $_E2X
|
---|
919 | $_Name
|
---|
920 | $_TableFiles
|
---|
921 | $_Now
|
---|
922 | );
|
---|
923 |
|
---|
924 | sub find_e2x{
|
---|
925 | eval { require File::Find; };
|
---|
926 | my (@inc, %e2x_dir);
|
---|
927 | for my $inc (@INC){
|
---|
928 | push @inc, $inc unless $inc eq '.'; #skip current dir
|
---|
929 | }
|
---|
930 | File::Find::find(
|
---|
931 | sub {
|
---|
932 | my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
|
---|
933 | $atime,$mtime,$ctime,$blksize,$blocks)
|
---|
934 | = lstat($_) or return;
|
---|
935 | -f _ or return;
|
---|
936 | if (/^.*\.e2x$/o){
|
---|
937 | no warnings 'once';
|
---|
938 | $e2x_dir{$File::Find::dir} ||= $mtime;
|
---|
939 | }
|
---|
940 | return;
|
---|
941 | }, @inc);
|
---|
942 | warn join("\n", keys %e2x_dir), "\n";
|
---|
943 | for my $d (sort {$e2x_dir{$a} <=> $e2x_dir{$b}} keys %e2x_dir){
|
---|
944 | $_E2X = $d;
|
---|
945 | # warn "$_E2X => ", scalar localtime($e2x_dir{$d});
|
---|
946 | return $_E2X;
|
---|
947 | }
|
---|
948 | }
|
---|
949 |
|
---|
950 | sub make_makefile_pl
|
---|
951 | {
|
---|
952 | eval { require Encode; };
|
---|
953 | $@ and die "You need to install Encode to use enc2xs -M\nerror: $@\n";
|
---|
954 | # our used for variable expanstion
|
---|
955 | $_Enc2xs = $0;
|
---|
956 | $_Version = $VERSION;
|
---|
957 | $_E2X = find_e2x();
|
---|
958 | $_Name = shift;
|
---|
959 | $_TableFiles = join(",", map {qq('$_')} @_);
|
---|
960 | $_Now = scalar localtime();
|
---|
961 |
|
---|
962 | eval { require File::Spec; };
|
---|
963 | _print_expand(File::Spec->catfile($_E2X,"Makefile_PL.e2x"),"Makefile.PL");
|
---|
964 | _print_expand(File::Spec->catfile($_E2X,"_PM.e2x"), "$_Name.pm");
|
---|
965 | _print_expand(File::Spec->catfile($_E2X,"_T.e2x"), "t/$_Name.t");
|
---|
966 | _print_expand(File::Spec->catfile($_E2X,"README.e2x"), "README");
|
---|
967 | _print_expand(File::Spec->catfile($_E2X,"Changes.e2x"), "Changes");
|
---|
968 | exit;
|
---|
969 | }
|
---|
970 |
|
---|
971 | use vars qw(
|
---|
972 | $_ModLines
|
---|
973 | $_LocalVer
|
---|
974 | );
|
---|
975 |
|
---|
976 | sub make_configlocal_pm
|
---|
977 | {
|
---|
978 | eval { require Encode; };
|
---|
979 | $@ and die "Unable to require Encode: $@\n";
|
---|
980 | eval { require File::Spec; };
|
---|
981 | # our used for variable expanstion
|
---|
982 | my %in_core = map {$_=>1}('ascii','iso-8859-1','utf8');
|
---|
983 | my %LocalMod = ();
|
---|
984 | for my $d (@INC){
|
---|
985 | my $inc = File::Spec->catfile($d, "Encode");
|
---|
986 | -d $inc or next;
|
---|
987 | opendir my $dh, $inc or die "$inc:$!";
|
---|
988 | warn "Checking $inc...\n";
|
---|
989 | for my $f (grep /\.pm$/o, readdir($dh)){
|
---|
990 | -f File::Spec->catfile($inc, "$f") or next;
|
---|
991 | $INC{"Encode/$f"} and next;
|
---|
992 | warn "require Encode/$f;\n";
|
---|
993 | eval { require "Encode/$f"; };
|
---|
994 | $@ and die "Can't require Encode/$f: $@\n";
|
---|
995 | for my $enc (Encode->encodings()){
|
---|
996 | no warnings 'once';
|
---|
997 | $in_core{$enc} and next;
|
---|
998 | $Encode::Config::ExtModule{$enc} and next;
|
---|
999 | my $mod = "Encode/$f";
|
---|
1000 | $mod =~ s/\.pm$//o; $mod =~ s,/,::,og;
|
---|
1001 | $LocalMod{$enc} ||= $mod;
|
---|
1002 | }
|
---|
1003 | }
|
---|
1004 | }
|
---|
1005 | $_ModLines = "";
|
---|
1006 | for my $enc (sort keys %LocalMod){
|
---|
1007 | $_ModLines .=
|
---|
1008 | qq(\$Encode::ExtModule{'$enc'} =\t"$LocalMod{$enc}";\n);
|
---|
1009 | }
|
---|
1010 | warn $_ModLines;
|
---|
1011 | $_LocalVer = _mkversion();
|
---|
1012 | $_E2X = find_e2x();
|
---|
1013 | $_Inc = $INC{"Encode.pm"}; $_Inc =~ s/\.pm$//o;
|
---|
1014 | _print_expand(File::Spec->catfile($_E2X,"ConfigLocal_PM.e2x"),
|
---|
1015 | File::Spec->catfile($_Inc,"ConfigLocal.pm"),
|
---|
1016 | 1);
|
---|
1017 | exit;
|
---|
1018 | }
|
---|
1019 |
|
---|
1020 | sub _mkversion{
|
---|
1021 | my ($ss,$mm,$hh,$dd,$mo,$yyyy) = localtime();
|
---|
1022 | $yyyy += 1900, $mo +=1;
|
---|
1023 | return sprintf("v%04d.%04d.%04d", $yyyy, $mo*100+$dd, $hh*100+$mm);
|
---|
1024 | }
|
---|
1025 |
|
---|
1026 | sub _print_expand{
|
---|
1027 | eval { require File::Basename; };
|
---|
1028 | $@ and die "File::Basename needed. Are you on miniperl?;\nerror: $@\n";
|
---|
1029 | File::Basename->import();
|
---|
1030 | my ($src, $dst, $clobber) = @_;
|
---|
1031 | if (!$clobber and -e $dst){
|
---|
1032 | warn "$dst exists. skipping\n";
|
---|
1033 | return;
|
---|
1034 | }
|
---|
1035 | warn "Generating $dst...\n";
|
---|
1036 | open my $in, $src or die "$src : $!";
|
---|
1037 | if ((my $d = dirname($dst)) ne '.'){
|
---|
1038 | -d $d or mkdir $d, 0755 or die "mkdir $d : $!";
|
---|
1039 | }
|
---|
1040 | open my $out, ">$dst" or die "$!";
|
---|
1041 | my $asis = 0;
|
---|
1042 | while (<$in>){
|
---|
1043 | if (/^#### END_OF_HEADER/){
|
---|
1044 | $asis = 1; next;
|
---|
1045 | }
|
---|
1046 | s/(\$_[A-Z][A-Za-z0-9]+)_/$1/gee unless $asis;
|
---|
1047 | print $out $_;
|
---|
1048 | }
|
---|
1049 | }
|
---|
1050 | __END__
|
---|
1051 |
|
---|
1052 | =head1 NAME
|
---|
1053 |
|
---|
1054 | enc2xs -- Perl Encode Module Generator
|
---|
1055 |
|
---|
1056 | =head1 SYNOPSIS
|
---|
1057 |
|
---|
1058 | enc2xs -[options]
|
---|
1059 | enc2xs -M ModName mapfiles...
|
---|
1060 | enc2xs -C
|
---|
1061 |
|
---|
1062 | =head1 DESCRIPTION
|
---|
1063 |
|
---|
1064 | F<enc2xs> builds a Perl extension for use by Encode from either
|
---|
1065 | Unicode Character Mapping files (.ucm) or Tcl Encoding Files (.enc).
|
---|
1066 | Besides being used internally during the build process of the Encode
|
---|
1067 | module, you can use F<enc2xs> to add your own encoding to perl.
|
---|
1068 | No knowledge of XS is necessary.
|
---|
1069 |
|
---|
1070 | =head1 Quick Guide
|
---|
1071 |
|
---|
1072 | If you want to know as little about Perl as possible but need to
|
---|
1073 | add a new encoding, just read this chapter and forget the rest.
|
---|
1074 |
|
---|
1075 | =over 4
|
---|
1076 |
|
---|
1077 | =item 0.
|
---|
1078 |
|
---|
1079 | Have a .ucm file ready. You can get it from somewhere or you can write
|
---|
1080 | your own from scratch or you can grab one from the Encode distribution
|
---|
1081 | and customize it. For the UCM format, see the next Chapter. In the
|
---|
1082 | example below, I'll call my theoretical encoding myascii, defined
|
---|
1083 | in I<my.ucm>. C<$> is a shell prompt.
|
---|
1084 |
|
---|
1085 | $ ls -F
|
---|
1086 | my.ucm
|
---|
1087 |
|
---|
1088 | =item 1.
|
---|
1089 |
|
---|
1090 | Issue a command as follows;
|
---|
1091 |
|
---|
1092 | $ enc2xs -M My my.ucm
|
---|
1093 | generating Makefile.PL
|
---|
1094 | generating My.pm
|
---|
1095 | generating README
|
---|
1096 | generating Changes
|
---|
1097 |
|
---|
1098 | Now take a look at your current directory. It should look like this.
|
---|
1099 |
|
---|
1100 | $ ls -F
|
---|
1101 | Makefile.PL My.pm my.ucm t/
|
---|
1102 |
|
---|
1103 | The following files were created.
|
---|
1104 |
|
---|
1105 | Makefile.PL - MakeMaker script
|
---|
1106 | My.pm - Encode submodule
|
---|
1107 | t/My.t - test file
|
---|
1108 |
|
---|
1109 | =over 4
|
---|
1110 |
|
---|
1111 | =item 1.1.
|
---|
1112 |
|
---|
1113 | If you want *.ucm installed together with the modules, do as follows;
|
---|
1114 |
|
---|
1115 | $ mkdir Encode
|
---|
1116 | $ mv *.ucm Encode
|
---|
1117 | $ enc2xs -M My Encode/*ucm
|
---|
1118 |
|
---|
1119 | =back
|
---|
1120 |
|
---|
1121 | =item 2.
|
---|
1122 |
|
---|
1123 | Edit the files generated. You don't have to if you have no time AND no
|
---|
1124 | intention to give it to someone else. But it is a good idea to edit
|
---|
1125 | the pod and to add more tests.
|
---|
1126 |
|
---|
1127 | =item 3.
|
---|
1128 |
|
---|
1129 | Now issue a command all Perl Mongers love:
|
---|
1130 |
|
---|
1131 | $ perl Makefile.PL
|
---|
1132 | Writing Makefile for Encode::My
|
---|
1133 |
|
---|
1134 | =item 4.
|
---|
1135 |
|
---|
1136 | Now all you have to do is make.
|
---|
1137 |
|
---|
1138 | $ make
|
---|
1139 | cp My.pm blib/lib/Encode/My.pm
|
---|
1140 | /usr/local/bin/perl /usr/local/bin/enc2xs -Q -O \
|
---|
1141 | -o encode_t.c -f encode_t.fnm
|
---|
1142 | Reading myascii (myascii)
|
---|
1143 | Writing compiled form
|
---|
1144 | 128 bytes in string tables
|
---|
1145 | 384 bytes (75%) saved spotting duplicates
|
---|
1146 | 1 bytes (0.775%) saved using substrings
|
---|
1147 | ....
|
---|
1148 | chmod 644 blib/arch/auto/Encode/My/My.bs
|
---|
1149 | $
|
---|
1150 |
|
---|
1151 | The time it takes varies depending on how fast your machine is and
|
---|
1152 | how large your encoding is. Unless you are working on something big
|
---|
1153 | like euc-tw, it won't take too long.
|
---|
1154 |
|
---|
1155 | =item 5.
|
---|
1156 |
|
---|
1157 | You can "make install" already but you should test first.
|
---|
1158 |
|
---|
1159 | $ make test
|
---|
1160 | PERL_DL_NONLAZY=1 /usr/local/bin/perl -Iblib/arch -Iblib/lib \
|
---|
1161 | -e 'use Test::Harness qw(&runtests $verbose); \
|
---|
1162 | $verbose=0; runtests @ARGV;' t/*.t
|
---|
1163 | t/My....ok
|
---|
1164 | All tests successful.
|
---|
1165 | Files=1, Tests=2, 0 wallclock secs
|
---|
1166 | ( 0.09 cusr + 0.01 csys = 0.09 CPU)
|
---|
1167 |
|
---|
1168 | =item 6.
|
---|
1169 |
|
---|
1170 | If you are content with the test result, just "make install"
|
---|
1171 |
|
---|
1172 | =item 7.
|
---|
1173 |
|
---|
1174 | If you want to add your encoding to Encode's demand-loading list
|
---|
1175 | (so you don't have to "use Encode::YourEncoding"), run
|
---|
1176 |
|
---|
1177 | enc2xs -C
|
---|
1178 |
|
---|
1179 | to update Encode::ConfigLocal, a module that controls local settings.
|
---|
1180 | After that, "use Encode;" is enough to load your encodings on demand.
|
---|
1181 |
|
---|
1182 | =back
|
---|
1183 |
|
---|
1184 | =head1 The Unicode Character Map
|
---|
1185 |
|
---|
1186 | Encode uses the Unicode Character Map (UCM) format for source character
|
---|
1187 | mappings. This format is used by IBM's ICU package and was adopted
|
---|
1188 | by Nick Ing-Simmons for use with the Encode module. Since UCM is
|
---|
1189 | more flexible than Tcl's Encoding Map and far more user-friendly,
|
---|
1190 | this is the recommended formet for Encode now.
|
---|
1191 |
|
---|
1192 | A UCM file looks like this.
|
---|
1193 |
|
---|
1194 | #
|
---|
1195 | # Comments
|
---|
1196 | #
|
---|
1197 | <code_set_name> "US-ascii" # Required
|
---|
1198 | <code_set_alias> "ascii" # Optional
|
---|
1199 | <mb_cur_min> 1 # Required; usually 1
|
---|
1200 | <mb_cur_max> 1 # Max. # of bytes/char
|
---|
1201 | <subchar> \x3F # Substitution char
|
---|
1202 | #
|
---|
1203 | CHARMAP
|
---|
1204 | <U0000> \x00 |0 # <control>
|
---|
1205 | <U0001> \x01 |0 # <control>
|
---|
1206 | <U0002> \x02 |0 # <control>
|
---|
1207 | ....
|
---|
1208 | <U007C> \x7C |0 # VERTICAL LINE
|
---|
1209 | <U007D> \x7D |0 # RIGHT CURLY BRACKET
|
---|
1210 | <U007E> \x7E |0 # TILDE
|
---|
1211 | <U007F> \x7F |0 # <control>
|
---|
1212 | END CHARMAP
|
---|
1213 |
|
---|
1214 | =over 4
|
---|
1215 |
|
---|
1216 | =item *
|
---|
1217 |
|
---|
1218 | Anything that follows C<#> is treated as a comment.
|
---|
1219 |
|
---|
1220 | =item *
|
---|
1221 |
|
---|
1222 | The header section continues until a line containing the word
|
---|
1223 | CHARMAP. This section has a form of I<E<lt>keywordE<gt> value>, one
|
---|
1224 | pair per line. Strings used as values must be quoted. Barewords are
|
---|
1225 | treated as numbers. I<\xXX> represents a byte.
|
---|
1226 |
|
---|
1227 | Most of the keywords are self-explanatory. I<subchar> means
|
---|
1228 | substitution character, not subcharacter. When you decode a Unicode
|
---|
1229 | sequence to this encoding but no matching character is found, the byte
|
---|
1230 | sequence defined here will be used. For most cases, the value here is
|
---|
1231 | \x3F; in ASCII, this is a question mark.
|
---|
1232 |
|
---|
1233 | =item *
|
---|
1234 |
|
---|
1235 | CHARMAP starts the character map section. Each line has a form as
|
---|
1236 | follows:
|
---|
1237 |
|
---|
1238 | <UXXXX> \xXX.. |0 # comment
|
---|
1239 | ^ ^ ^
|
---|
1240 | | | +- Fallback flag
|
---|
1241 | | +-------- Encoded byte sequence
|
---|
1242 | +-------------- Unicode Character ID in hex
|
---|
1243 |
|
---|
1244 | The format is roughly the same as a header section except for the
|
---|
1245 | fallback flag: | followed by 0..3. The meaning of the possible
|
---|
1246 | values is as follows:
|
---|
1247 |
|
---|
1248 | =over 4
|
---|
1249 |
|
---|
1250 | =item |0
|
---|
1251 |
|
---|
1252 | Round trip safe. A character decoded to Unicode encodes back to the
|
---|
1253 | same byte sequence. Most characters have this flag.
|
---|
1254 |
|
---|
1255 | =item |1
|
---|
1256 |
|
---|
1257 | Fallback for unicode -> encoding. When seen, enc2xs adds this
|
---|
1258 | character for the encode map only.
|
---|
1259 |
|
---|
1260 | =item |2
|
---|
1261 |
|
---|
1262 | Skip sub-char mapping should there be no code point.
|
---|
1263 |
|
---|
1264 | =item |3
|
---|
1265 |
|
---|
1266 | Fallback for encoding -> unicode. When seen, enc2xs adds this
|
---|
1267 | character for the decode map only.
|
---|
1268 |
|
---|
1269 | =back
|
---|
1270 |
|
---|
1271 | =item *
|
---|
1272 |
|
---|
1273 | And finally, END OF CHARMAP ends the section.
|
---|
1274 |
|
---|
1275 | =back
|
---|
1276 |
|
---|
1277 | When you are manually creating a UCM file, you should copy ascii.ucm
|
---|
1278 | or an existing encoding which is close to yours, rather than write
|
---|
1279 | your own from scratch.
|
---|
1280 |
|
---|
1281 | When you do so, make sure you leave at least B<U0000> to B<U0020> as
|
---|
1282 | is, unless your environment is EBCDIC.
|
---|
1283 |
|
---|
1284 | B<CAVEAT>: not all features in UCM are implemented. For example,
|
---|
1285 | icu:state is not used. Because of that, you need to write a perl
|
---|
1286 | module if you want to support algorithmical encodings, notably
|
---|
1287 | the ISO-2022 series. Such modules include L<Encode::JP::2022_JP>,
|
---|
1288 | L<Encode::KR::2022_KR>, and L<Encode::TW::HZ>.
|
---|
1289 |
|
---|
1290 | =head2 Coping with duplicate mappings
|
---|
1291 |
|
---|
1292 | When you create a map, you SHOULD make your mappings round-trip safe.
|
---|
1293 | That is, C<encode('your-encoding', decode('your-encoding', $data)) eq
|
---|
1294 | $data> stands for all characters that are marked as C<|0>. Here is
|
---|
1295 | how to make sure:
|
---|
1296 |
|
---|
1297 | =over 4
|
---|
1298 |
|
---|
1299 | =item *
|
---|
1300 |
|
---|
1301 | Sort your map in Unicode order.
|
---|
1302 |
|
---|
1303 | =item *
|
---|
1304 |
|
---|
1305 | When you have a duplicate entry, mark either one with '|1' or '|3'.
|
---|
1306 |
|
---|
1307 | =item *
|
---|
1308 |
|
---|
1309 | And make sure the '|1' or '|3' entry FOLLOWS the '|0' entry.
|
---|
1310 |
|
---|
1311 | =back
|
---|
1312 |
|
---|
1313 | Here is an example from big5-eten.
|
---|
1314 |
|
---|
1315 | <U2550> \xF9\xF9 |0
|
---|
1316 | <U2550> \xA2\xA4 |3
|
---|
1317 |
|
---|
1318 | Internally Encoding -> Unicode and Unicode -> Encoding Map looks like
|
---|
1319 | this;
|
---|
1320 |
|
---|
1321 | E to U U to E
|
---|
1322 | --------------------------------------
|
---|
1323 | \xF9\xF9 => U2550 U2550 => \xF9\xF9
|
---|
1324 | \xA2\xA4 => U2550
|
---|
1325 |
|
---|
1326 | So it is round-trip safe for \xF9\xF9. But if the line above is upside
|
---|
1327 | down, here is what happens.
|
---|
1328 |
|
---|
1329 | E to U U to E
|
---|
1330 | --------------------------------------
|
---|
1331 | \xA2\xA4 => U2550 U2550 => \xF9\xF9
|
---|
1332 | (\xF9\xF9 => U2550 is now overwritten!)
|
---|
1333 |
|
---|
1334 | The Encode package comes with F<ucmlint>, a crude but sufficient
|
---|
1335 | utility to check the integrity of a UCM file. Check under the
|
---|
1336 | Encode/bin directory for this.
|
---|
1337 |
|
---|
1338 | When in doubt, you can use F<ucmsort>, yet another utility under
|
---|
1339 | Encode/bin directory.
|
---|
1340 |
|
---|
1341 | =head1 Bookmarks
|
---|
1342 |
|
---|
1343 | =over 4
|
---|
1344 |
|
---|
1345 | =item *
|
---|
1346 |
|
---|
1347 | ICU Home Page
|
---|
1348 | L<http://oss.software.ibm.com/icu/>
|
---|
1349 |
|
---|
1350 | =item *
|
---|
1351 |
|
---|
1352 | ICU Character Mapping Tables
|
---|
1353 | L<http://oss.software.ibm.com/icu/charset/>
|
---|
1354 |
|
---|
1355 | =item *
|
---|
1356 |
|
---|
1357 | ICU:Conversion Data
|
---|
1358 | L<http://oss.software.ibm.com/icu/userguide/conversion-data.html>
|
---|
1359 |
|
---|
1360 | =back
|
---|
1361 |
|
---|
1362 | =head1 SEE ALSO
|
---|
1363 |
|
---|
1364 | L<Encode>,
|
---|
1365 | L<perlmod>,
|
---|
1366 | L<perlpod>
|
---|
1367 |
|
---|
1368 | =cut
|
---|
1369 |
|
---|
1370 | # -Q to disable the duplicate codepoint test
|
---|
1371 | # -S make mapping errors fatal
|
---|
1372 | # -q to remove comments written to output files
|
---|
1373 | # -O to enable the (brute force) substring optimiser
|
---|
1374 | # -o <output> to specify the output file name (else it's the first arg)
|
---|
1375 | # -f <inlist> to give a file with a list of input files (else use the args)
|
---|
1376 | # -n <name> to name the encoding (else use the basename of the input file.
|
---|
1377 |
|
---|
1378 | With %seen holding array refs:
|
---|
1379 |
|
---|
1380 | 865.66 real 28.80 user 8.79 sys
|
---|
1381 | 7904 maximum resident set size
|
---|
1382 | 1356 average shared memory size
|
---|
1383 | 18566 average unshared data size
|
---|
1384 | 229 average unshared stack size
|
---|
1385 | 46080 page reclaims
|
---|
1386 | 33373 page faults
|
---|
1387 |
|
---|
1388 | With %seen holding simple scalars:
|
---|
1389 |
|
---|
1390 | 342.16 real 27.11 user 3.54 sys
|
---|
1391 | 8388 maximum resident set size
|
---|
1392 | 1394 average shared memory size
|
---|
1393 | 14969 average unshared data size
|
---|
1394 | 236 average unshared stack size
|
---|
1395 | 28159 page reclaims
|
---|
1396 | 9839 page faults
|
---|
1397 |
|
---|
1398 | Yes, 5 minutes is faster than 15. Above is for CP936 in CN. Only difference is
|
---|
1399 | how %seen is storing things its seen. So it is pathalogically bad on a 16M
|
---|
1400 | RAM machine, but it's going to help even on modern machines.
|
---|
1401 | Swapping is bad, m'kay :-)
|
---|
1402 |
|
---|
1403 | __END__
|
---|
1404 | :endofperl
|
---|