1 | package Encode::CN::HZ;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 |
|
---|
5 | use vars qw($VERSION);
|
---|
6 | $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
7 |
|
---|
8 | use Encode qw(:fallbacks);
|
---|
9 |
|
---|
10 | use base qw(Encode::Encoding);
|
---|
11 | __PACKAGE__->Define('hz');
|
---|
12 |
|
---|
13 | # HZ is a combination of ASCII and escaped GB, so we implement it
|
---|
14 | # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
|
---|
15 |
|
---|
16 | # not ported for EBCDIC. Which should be used, "~" or "\x7E"?
|
---|
17 |
|
---|
18 | sub needs_lines { 1 }
|
---|
19 |
|
---|
20 | sub decode ($$;$)
|
---|
21 | {
|
---|
22 | my ($obj,$str,$chk) = @_;
|
---|
23 |
|
---|
24 | my $GB = Encode::find_encoding('gb2312-raw');
|
---|
25 | my $ret = '';
|
---|
26 | my $in_ascii = 1; # default mode is ASCII.
|
---|
27 |
|
---|
28 | while (length $str) {
|
---|
29 | if ($in_ascii) { # ASCII mode
|
---|
30 | if ($str =~ s/^([\x00-\x7D\x7F]+)//) { # no '~' => ASCII
|
---|
31 | $ret .= $1;
|
---|
32 | # EBCDIC should need ascii2native, but not ported.
|
---|
33 | }
|
---|
34 | elsif ($str =~ s/^\x7E\x7E//) { # escaped tilde
|
---|
35 | $ret .= '~';
|
---|
36 | }
|
---|
37 | elsif ($str =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII
|
---|
38 | 1; # no-op
|
---|
39 | }
|
---|
40 | elsif ($str =~ s/^\x7E\x7B//) { # '~{'
|
---|
41 | $in_ascii = 0; # to GB
|
---|
42 | }
|
---|
43 | else { # encounters an invalid escape, \x80 or greater
|
---|
44 | last;
|
---|
45 | }
|
---|
46 | }
|
---|
47 | else { # GB mode; the byte ranges are as in RFC 1843.
|
---|
48 | if ($str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)//) {
|
---|
49 | $ret .= $GB->decode($1, $chk);
|
---|
50 | }
|
---|
51 | elsif ($str =~ s/^\x7E\x7D//) { # '~}'
|
---|
52 | $in_ascii = 1;
|
---|
53 | }
|
---|
54 | else { # invalid
|
---|
55 | last;
|
---|
56 | }
|
---|
57 | }
|
---|
58 | }
|
---|
59 | $_[1] = '' if $chk; # needs_lines guarantees no partial character
|
---|
60 | return $ret;
|
---|
61 | }
|
---|
62 |
|
---|
63 | sub cat_decode {
|
---|
64 | my ($obj, undef, $src, $pos, $trm, $chk) = @_;
|
---|
65 | my ($rdst, $rsrc, $rpos) = \@_[1..3];
|
---|
66 |
|
---|
67 | my $GB = Encode::find_encoding('gb2312-raw');
|
---|
68 | my $ret = '';
|
---|
69 | my $in_ascii = 1; # default mode is ASCII.
|
---|
70 |
|
---|
71 | my $ini_pos = pos($$rsrc);
|
---|
72 |
|
---|
73 | substr($src, 0, $pos) = '';
|
---|
74 |
|
---|
75 | my $ini_len = bytes::length($src);
|
---|
76 |
|
---|
77 | # $trm is the first of the pair '~~', then 2nd tilde is to be removed.
|
---|
78 | # XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
|
---|
79 | $src =~ s/^\x7E// if $trm eq "\x7E";
|
---|
80 |
|
---|
81 | while (length $src) {
|
---|
82 | my $now;
|
---|
83 | if ($in_ascii) { # ASCII mode
|
---|
84 | if ($src =~ s/^([\x00-\x7D\x7F])//) { # no '~' => ASCII
|
---|
85 | $now = $1;
|
---|
86 | }
|
---|
87 | elsif ($src =~ s/^\x7E\x7E//) { # escaped tilde
|
---|
88 | $now = '~';
|
---|
89 | }
|
---|
90 | elsif ($src =~ s/^\x7E\cJ//) { # '\cJ' == LF in ASCII
|
---|
91 | next;
|
---|
92 | }
|
---|
93 | elsif ($src =~ s/^\x7E\x7B//) { # '~{'
|
---|
94 | $in_ascii = 0; # to GB
|
---|
95 | next;
|
---|
96 | }
|
---|
97 | else { # encounters an invalid escape, \x80 or greater
|
---|
98 | last;
|
---|
99 | }
|
---|
100 | }
|
---|
101 | else { # GB mode; the byte ranges are as in RFC 1843.
|
---|
102 | if ($src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)//) {
|
---|
103 | $now = $GB->decode($1, $chk);
|
---|
104 | }
|
---|
105 | elsif ($src =~ s/^\x7E\x7D//) { # '~}'
|
---|
106 | $in_ascii = 1;
|
---|
107 | next;
|
---|
108 | }
|
---|
109 | else { # invalid
|
---|
110 | last;
|
---|
111 | }
|
---|
112 | }
|
---|
113 |
|
---|
114 | next if ! defined $now;
|
---|
115 |
|
---|
116 | $ret .= $now;
|
---|
117 |
|
---|
118 | if ($now eq $trm) {
|
---|
119 | $$rdst .= $ret;
|
---|
120 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
|
---|
121 | pos($$rsrc) = $ini_pos;
|
---|
122 | return 1;
|
---|
123 | }
|
---|
124 | }
|
---|
125 |
|
---|
126 | $$rdst .= $ret;
|
---|
127 | $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
|
---|
128 | pos($$rsrc) = $ini_pos;
|
---|
129 | return ''; # terminator not found
|
---|
130 | }
|
---|
131 |
|
---|
132 |
|
---|
133 | sub encode($$;$)
|
---|
134 | {
|
---|
135 | my ($obj,$str,$chk) = @_;
|
---|
136 |
|
---|
137 | my $GB = Encode::find_encoding('gb2312-raw');
|
---|
138 | my $ret = '';
|
---|
139 | my $in_ascii = 1; # default mode is ASCII.
|
---|
140 |
|
---|
141 | no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.
|
---|
142 |
|
---|
143 | while (length $str) {
|
---|
144 | if ($str =~ s/^([[:ascii:]]+)//) {
|
---|
145 | my $tmp = $1;
|
---|
146 | $tmp =~ s/~/~~/g; # escapes tildes
|
---|
147 | if (! $in_ascii) {
|
---|
148 | $ret .= "\x7E\x7D"; # '~}'
|
---|
149 | $in_ascii = 1;
|
---|
150 | }
|
---|
151 | $ret .= pack 'a*', $tmp; # remove UTF8 flag.
|
---|
152 | }
|
---|
153 | elsif ($str =~ s/(.)//) {
|
---|
154 | my $s = $1;
|
---|
155 | my $tmp = $GB->encode($s, $chk);
|
---|
156 | last if !defined $tmp;
|
---|
157 | if (length $tmp == 2) { # maybe a valid GB char (XXX)
|
---|
158 | if ($in_ascii) {
|
---|
159 | $ret .= "\x7E\x7B"; # '~{'
|
---|
160 | $in_ascii = 0;
|
---|
161 | }
|
---|
162 | $ret .= $tmp;
|
---|
163 | }
|
---|
164 | elsif (length $tmp) { # maybe FALLBACK in ASCII (XXX)
|
---|
165 | if (!$in_ascii) {
|
---|
166 | $ret .= "\x7E\x7D"; # '~}'
|
---|
167 | $in_ascii = 1;
|
---|
168 | }
|
---|
169 | $ret .= $tmp;
|
---|
170 | }
|
---|
171 | }
|
---|
172 | else { # if $str is malformed UTF8 *and* if length $str != 0.
|
---|
173 | last;
|
---|
174 | }
|
---|
175 | }
|
---|
176 | $_[1] = $str if $chk;
|
---|
177 |
|
---|
178 | # The state at the end of the chunk is discarded, even if in GB mode.
|
---|
179 | # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
|
---|
180 | # Parhaps it is harmless, but further investigations may be required...
|
---|
181 |
|
---|
182 | if (! $in_ascii) {
|
---|
183 | $ret .= "\x7E\x7D"; # '~}'
|
---|
184 | $in_ascii = 1;
|
---|
185 | }
|
---|
186 | return $ret;
|
---|
187 | }
|
---|
188 |
|
---|
189 | 1;
|
---|
190 | __END__
|
---|
191 |
|
---|
192 | =head1 NAME
|
---|
193 |
|
---|
194 | Encode::CN::HZ -- internally used by Encode::CN
|
---|
195 |
|
---|
196 | =cut
|
---|