1 | package Encode::Encoding;
|
---|
2 | # Base class for classes which implement encodings
|
---|
3 | use strict;
|
---|
4 | our $VERSION = do { my @r = (q$Revision: 2.2 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
5 |
|
---|
6 | require Encode;
|
---|
7 |
|
---|
8 | sub DEBUG { 0 }
|
---|
9 | sub Define
|
---|
10 | {
|
---|
11 | my $obj = shift;
|
---|
12 | my $canonical = shift;
|
---|
13 | $obj = bless { Name => $canonical },$obj unless ref $obj;
|
---|
14 | # warn "$canonical => $obj\n";
|
---|
15 | Encode::define_encoding($obj, $canonical, @_);
|
---|
16 | }
|
---|
17 |
|
---|
18 | sub name { return shift->{'Name'} }
|
---|
19 |
|
---|
20 | # sub renew { return $_[0] }
|
---|
21 |
|
---|
22 | sub renew {
|
---|
23 | my $self = shift;
|
---|
24 | my $clone = bless { %$self } => ref($self);
|
---|
25 | $clone->{renewed}++; # so the caller can see it
|
---|
26 | DEBUG and warn $clone->{renewed};
|
---|
27 | return $clone;
|
---|
28 | }
|
---|
29 |
|
---|
30 | sub renewed{ return $_[0]->{renewed} || 0 }
|
---|
31 |
|
---|
32 | *new_sequence = \&renew;
|
---|
33 |
|
---|
34 | sub needs_lines { 0 };
|
---|
35 |
|
---|
36 | sub perlio_ok {
|
---|
37 | eval{ require PerlIO::encoding };
|
---|
38 | return $@ ? 0 : 1;
|
---|
39 | }
|
---|
40 |
|
---|
41 | # (Temporary|legacy) methods
|
---|
42 |
|
---|
43 | sub toUnicode { shift->decode(@_) }
|
---|
44 | sub fromUnicode { shift->encode(@_) }
|
---|
45 |
|
---|
46 | #
|
---|
47 | # Needs to be overloaded or just croak
|
---|
48 | #
|
---|
49 |
|
---|
50 | sub encode {
|
---|
51 | require Carp;
|
---|
52 | my $obj = shift;
|
---|
53 | my $class = ref($obj) ? ref($obj) : $obj;
|
---|
54 | Carp::croak($class . "->encode() not defined!");
|
---|
55 | }
|
---|
56 |
|
---|
57 | sub decode{
|
---|
58 | require Carp;
|
---|
59 | my $obj = shift;
|
---|
60 | my $class = ref($obj) ? ref($obj) : $obj;
|
---|
61 | Carp::croak($class . "->encode() not defined!");
|
---|
62 | }
|
---|
63 |
|
---|
64 | sub DESTROY {}
|
---|
65 |
|
---|
66 | 1;
|
---|
67 | __END__
|
---|
68 |
|
---|
69 | =head1 NAME
|
---|
70 |
|
---|
71 | Encode::Encoding - Encode Implementation Base Class
|
---|
72 |
|
---|
73 | =head1 SYNOPSIS
|
---|
74 |
|
---|
75 | package Encode::MyEncoding;
|
---|
76 | use base qw(Encode::Encoding);
|
---|
77 |
|
---|
78 | __PACKAGE__->Define(qw(myCanonical myAlias));
|
---|
79 |
|
---|
80 | =head1 DESCRIPTION
|
---|
81 |
|
---|
82 | As mentioned in L<Encode>, encodings are (in the current
|
---|
83 | implementation at least) defined as objects. The mapping of encoding
|
---|
84 | name to object is via the C<%Encode::Encoding> hash. Though you can
|
---|
85 | directly manipulate this hash, it is strongly encouraged to use this
|
---|
86 | base class module and add encode() and decode() methods.
|
---|
87 |
|
---|
88 | =head2 Methods you should implement
|
---|
89 |
|
---|
90 | You are strongly encouraged to implement methods below, at least
|
---|
91 | either encode() or decode().
|
---|
92 |
|
---|
93 | =over 4
|
---|
94 |
|
---|
95 | =item -E<gt>encode($string [,$check])
|
---|
96 |
|
---|
97 | MUST return the octet sequence representing I<$string>.
|
---|
98 |
|
---|
99 | =over 2
|
---|
100 |
|
---|
101 | =item *
|
---|
102 |
|
---|
103 | If I<$check> is true, it SHOULD modify I<$string> in place to remove
|
---|
104 | the converted part (i.e. the whole string unless there is an error).
|
---|
105 | If perlio_ok() is true, SHOULD becomes MUST.
|
---|
106 |
|
---|
107 | =item *
|
---|
108 |
|
---|
109 | If an error occurs, it SHOULD return the octet sequence for the
|
---|
110 | fragment of string that has been converted and modify $string in-place
|
---|
111 | to remove the converted part leaving it starting with the problem
|
---|
112 | fragment. If perlio_ok() is true, SHOULD becomes MUST.
|
---|
113 |
|
---|
114 | =item *
|
---|
115 |
|
---|
116 | If I<$check> is is false then C<encode> MUST make a "best effort" to
|
---|
117 | convert the string - for example, by using a replacement character.
|
---|
118 |
|
---|
119 | =back
|
---|
120 |
|
---|
121 | =item -E<gt>decode($octets [,$check])
|
---|
122 |
|
---|
123 | MUST return the string that I<$octets> represents.
|
---|
124 |
|
---|
125 | =over 2
|
---|
126 |
|
---|
127 | =item *
|
---|
128 |
|
---|
129 | If I<$check> is true, it SHOULD modify I<$octets> in place to remove
|
---|
130 | the converted part (i.e. the whole sequence unless there is an
|
---|
131 | error). If perlio_ok() is true, SHOULD becomes MUST.
|
---|
132 |
|
---|
133 | =item *
|
---|
134 |
|
---|
135 | If an error occurs, it SHOULD return the fragment of string that has
|
---|
136 | been converted and modify $octets in-place to remove the converted
|
---|
137 | part leaving it starting with the problem fragment. If perlio_ok() is
|
---|
138 | true, SHOULD becomes MUST.
|
---|
139 |
|
---|
140 | =item *
|
---|
141 |
|
---|
142 | If I<$check> is false then C<decode> should make a "best effort" to
|
---|
143 | convert the string - for example by using Unicode's "\x{FFFD}" as a
|
---|
144 | replacement character.
|
---|
145 |
|
---|
146 | =back
|
---|
147 |
|
---|
148 | =back
|
---|
149 |
|
---|
150 | If you want your encoding to work with L<encoding> pragma, you should
|
---|
151 | also implement the method below.
|
---|
152 |
|
---|
153 | =over 4
|
---|
154 |
|
---|
155 | =item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])
|
---|
156 |
|
---|
157 | MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.
|
---|
158 | Decoding will terminate when $terminator (a string) appears in output.
|
---|
159 | I<$offset> will be modified to the last $octets position at end of decode.
|
---|
160 | Returns true if $terminator appears output, else returns false.
|
---|
161 |
|
---|
162 | =back
|
---|
163 |
|
---|
164 | =head2 Other methods defined in Encode::Encodings
|
---|
165 |
|
---|
166 | You do not have to override methods shown below unless you have to.
|
---|
167 |
|
---|
168 | =over 4
|
---|
169 |
|
---|
170 | =item -E<gt>name
|
---|
171 |
|
---|
172 | Predefined As:
|
---|
173 |
|
---|
174 | sub name { return shift->{'Name'} }
|
---|
175 |
|
---|
176 | MUST return the string representing the canonical name of the encoding.
|
---|
177 |
|
---|
178 | =item -E<gt>renew
|
---|
179 |
|
---|
180 | Predefined As:
|
---|
181 |
|
---|
182 | sub renew {
|
---|
183 | my $self = shift;
|
---|
184 | my $clone = bless { %$self } => ref($self);
|
---|
185 | $clone->{renewed}++;
|
---|
186 | return $clone;
|
---|
187 | }
|
---|
188 |
|
---|
189 | This method reconstructs the encoding object if necessary. If you need
|
---|
190 | to store the state during encoding, this is where you clone your object.
|
---|
191 |
|
---|
192 | PerlIO ALWAYS calls this method to make sure it has its own private
|
---|
193 | encoding object.
|
---|
194 |
|
---|
195 | =item -E<gt>renewed
|
---|
196 |
|
---|
197 | Predefined As:
|
---|
198 |
|
---|
199 | sub renewed { $_[0]->{renewed} || 0 }
|
---|
200 |
|
---|
201 | Tells whether the object is renewed (and how many times). Some
|
---|
202 | modules emit C<Use of uninitialized value in null operation> warning
|
---|
203 | unless the value is numeric so return 0 for false.
|
---|
204 |
|
---|
205 | =item -E<gt>perlio_ok()
|
---|
206 |
|
---|
207 | Predefined As:
|
---|
208 |
|
---|
209 | sub perlio_ok {
|
---|
210 | eval{ require PerlIO::encoding };
|
---|
211 | return $@ ? 0 : 1;
|
---|
212 | }
|
---|
213 |
|
---|
214 | If your encoding does not support PerlIO for some reasons, just;
|
---|
215 |
|
---|
216 | sub perlio_ok { 0 }
|
---|
217 |
|
---|
218 | =item -E<gt>needs_lines()
|
---|
219 |
|
---|
220 | Predefined As:
|
---|
221 |
|
---|
222 | sub needs_lines { 0 };
|
---|
223 |
|
---|
224 | If your encoding can work with PerlIO but needs line buffering, you
|
---|
225 | MUST define this method so it returns true. 7bit ISO-2022 encodings
|
---|
226 | are one example that needs this. When this method is missing, false
|
---|
227 | is assumed.
|
---|
228 |
|
---|
229 | =back
|
---|
230 |
|
---|
231 | =head2 Example: Encode::ROT13
|
---|
232 |
|
---|
233 | package Encode::ROT13;
|
---|
234 | use strict;
|
---|
235 | use base qw(Encode::Encoding);
|
---|
236 |
|
---|
237 | __PACKAGE__->Define('rot13');
|
---|
238 |
|
---|
239 | sub encode($$;$){
|
---|
240 | my ($obj, $str, $chk) = @_;
|
---|
241 | $str =~ tr/A-Za-z/N-ZA-Mn-za-m/;
|
---|
242 | $_[1] = '' if $chk; # this is what in-place edit means
|
---|
243 | return $str;
|
---|
244 | }
|
---|
245 |
|
---|
246 | # Jr pna or ynml yvxr guvf;
|
---|
247 | *decode = \&encode;
|
---|
248 |
|
---|
249 | 1;
|
---|
250 |
|
---|
251 | =head1 Why the heck Encode API is different?
|
---|
252 |
|
---|
253 | It should be noted that the I<$check> behaviour is different from the
|
---|
254 | outer public API. The logic is that the "unchecked" case is useful
|
---|
255 | when the encoding is part of a stream which may be reporting errors
|
---|
256 | (e.g. STDERR). In such cases, it is desirable to get everything
|
---|
257 | through somehow without causing additional errors which obscure the
|
---|
258 | original one. Also, the encoding is best placed to know what the
|
---|
259 | correct replacement character is, so if that is the desired behaviour
|
---|
260 | then letting low level code do it is the most efficient.
|
---|
261 |
|
---|
262 | By contrast, if I<$check> is true, the scheme above allows the
|
---|
263 | encoding to do as much as it can and tell the layer above how much
|
---|
264 | that was. What is lacking at present is a mechanism to report what
|
---|
265 | went wrong. The most likely interface will be an additional method
|
---|
266 | call to the object, or perhaps (to avoid forcing per-stream objects
|
---|
267 | on otherwise stateless encodings) an additional parameter.
|
---|
268 |
|
---|
269 | It is also highly desirable that encoding classes inherit from
|
---|
270 | C<Encode::Encoding> as a base class. This allows that class to define
|
---|
271 | additional behaviour for all encoding objects.
|
---|
272 |
|
---|
273 | package Encode::MyEncoding;
|
---|
274 | use base qw(Encode::Encoding);
|
---|
275 |
|
---|
276 | __PACKAGE__->Define(qw(myCanonical myAlias));
|
---|
277 |
|
---|
278 | to create an object with C<< bless {Name => ...}, $class >>, and call
|
---|
279 | define_encoding. They inherit their C<name> method from
|
---|
280 | C<Encode::Encoding>.
|
---|
281 |
|
---|
282 | =head2 Compiled Encodings
|
---|
283 |
|
---|
284 | For the sake of speed and efficiency, most of the encodings are now
|
---|
285 | supported via a I<compiled form>: XS modules generated from UCM
|
---|
286 | files. Encode provides the enc2xs tool to achieve that. Please see
|
---|
287 | L<enc2xs> for more details.
|
---|
288 |
|
---|
289 | =head1 SEE ALSO
|
---|
290 |
|
---|
291 | L<perlmod>, L<enc2xs>
|
---|
292 |
|
---|
293 | =begin future
|
---|
294 |
|
---|
295 | =over 4
|
---|
296 |
|
---|
297 | =item Scheme 1
|
---|
298 |
|
---|
299 | The fixup routine gets passed the remaining fragment of string being
|
---|
300 | processed. It modifies it in place to remove bytes/characters it can
|
---|
301 | understand and returns a string used to represent them. For example:
|
---|
302 |
|
---|
303 | sub fixup {
|
---|
304 | my $ch = substr($_[0],0,1,'');
|
---|
305 | return sprintf("\x{%02X}",ord($ch);
|
---|
306 | }
|
---|
307 |
|
---|
308 | This scheme is close to how the underlying C code for Encode works,
|
---|
309 | but gives the fixup routine very little context.
|
---|
310 |
|
---|
311 | =item Scheme 2
|
---|
312 |
|
---|
313 | The fixup routine gets passed the original string, an index into
|
---|
314 | it of the problem area, and the output string so far. It appends
|
---|
315 | what it wants to the output string and returns a new index into the
|
---|
316 | original string. For example:
|
---|
317 |
|
---|
318 | sub fixup {
|
---|
319 | # my ($s,$i,$d) = @_;
|
---|
320 | my $ch = substr($_[0],$_[1],1);
|
---|
321 | $_[2] .= sprintf("\x{%02X}",ord($ch);
|
---|
322 | return $_[1]+1;
|
---|
323 | }
|
---|
324 |
|
---|
325 | This scheme gives maximal control to the fixup routine but is more
|
---|
326 | complicated to code, and may require that the internals of Encode be tweaked to
|
---|
327 | keep the original string intact.
|
---|
328 |
|
---|
329 | =item Other Schemes
|
---|
330 |
|
---|
331 | Hybrids of the above.
|
---|
332 |
|
---|
333 | Multiple return values rather than in-place modifications.
|
---|
334 |
|
---|
335 | Index into the string could be C<pos($str)> allowing C<s/\G...//>.
|
---|
336 |
|
---|
337 | =back
|
---|
338 |
|
---|
339 | =end future
|
---|
340 |
|
---|
341 | =cut
|
---|