1 | #
|
---|
2 | # $Id: UTF7.pm,v 2.1 2004/05/25 16:27:14 dankogai Exp $
|
---|
3 | #
|
---|
4 | package Encode::Unicode::UTF7;
|
---|
5 | use strict;
|
---|
6 | no warnings 'redefine';
|
---|
7 | use base qw(Encode::Encoding);
|
---|
8 | __PACKAGE__->Define('UTF-7');
|
---|
9 | our $VERSION = do { my @r = (q$Revision: 2.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
|
---|
10 | use MIME::Base64;
|
---|
11 | use Encode;
|
---|
12 |
|
---|
13 | #
|
---|
14 | # Algorithms taken from Unicode::String by Gisle Aas
|
---|
15 | #
|
---|
16 |
|
---|
17 | our $OPTIONAL_DIRECT_CHARS = 1;
|
---|
18 | my $specials = quotemeta "\'(),-./:?";
|
---|
19 | $OPTIONAL_DIRECT_CHARS and
|
---|
20 | $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
|
---|
21 | # \s will not work because it matches U+3000 DEOGRAPHIC SPACE
|
---|
22 | # We use qr/[\n\r\t\ ] instead
|
---|
23 | my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/;
|
---|
24 | my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/;
|
---|
25 | my $e_utf16 = find_encoding("UTF-16BE");
|
---|
26 |
|
---|
27 | sub needs_lines { 1 };
|
---|
28 |
|
---|
29 | sub encode($$;$){
|
---|
30 | my ($obj, $str, $chk) = @_;
|
---|
31 | my $len = length($str);
|
---|
32 | pos($str) = 0;
|
---|
33 | my $bytes = '';
|
---|
34 | while (pos($str) < $len){
|
---|
35 | if ($str =~ /\G($re_asis+)/ogc){
|
---|
36 | $bytes .= $1;
|
---|
37 | }elsif($str =~ /\G($re_encoded+)/ogsc){
|
---|
38 | if ($1 eq "+"){
|
---|
39 | $bytes .= "+-";
|
---|
40 | }else{
|
---|
41 | my $s = $1;
|
---|
42 | my $base64 = encode_base64($e_utf16->encode($s), '');
|
---|
43 | $base64 =~ s/=+$//;
|
---|
44 | $bytes .= "+$base64-";
|
---|
45 | }
|
---|
46 | }else{
|
---|
47 | die "This should not happen! (pos=" . pos($str) . ")";
|
---|
48 | }
|
---|
49 | }
|
---|
50 | $_[1] = '' if $chk;
|
---|
51 | return $bytes;
|
---|
52 | }
|
---|
53 |
|
---|
54 | sub decode{
|
---|
55 | my ($obj, $bytes, $chk) = @_;
|
---|
56 | my $len = length($bytes);
|
---|
57 | my $str = "";
|
---|
58 | while (pos($bytes) < $len) {
|
---|
59 | if ($bytes =~ /\G([^+]+)/ogc) {
|
---|
60 | $str .= $1;
|
---|
61 | }elsif($bytes =~ /\G\+-/ogc) {
|
---|
62 | $str .= "+";
|
---|
63 | }elsif($bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc) {
|
---|
64 | my $base64 = $1;
|
---|
65 | my $pad = length($base64) % 4;
|
---|
66 | $base64 .= "=" x (4 - $pad) if $pad;
|
---|
67 | $str .= $e_utf16->decode(decode_base64($base64));
|
---|
68 | }elsif($bytes =~ /\G\+/ogc) {
|
---|
69 | $^W and warn "Bad UTF7 data escape";
|
---|
70 | $str .= "+";
|
---|
71 | }else{
|
---|
72 | die "This should not happen " . pos($bytes);
|
---|
73 | }
|
---|
74 | }
|
---|
75 | $_[1] = '' if $chk;
|
---|
76 | return $str;
|
---|
77 | }
|
---|
78 | 1;
|
---|
79 | __END__
|
---|
80 |
|
---|
81 | =head1 NAME
|
---|
82 |
|
---|
83 | Encode::Unicode::UTF7 -- UTF-7 encoding
|
---|
84 |
|
---|
85 | =head1 SYNOPSIS
|
---|
86 |
|
---|
87 | use Encode qw/encode decode/;
|
---|
88 | $utf7 = encode("UTF-7", $utf8);
|
---|
89 | $utf8 = decode("UTF-7", $ucs2);
|
---|
90 |
|
---|
91 | =head1 ABSTRACT
|
---|
92 |
|
---|
93 | This module implements UTF-7 encoding documented in RFC 2152. UTF-7,
|
---|
94 | as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It
|
---|
95 | is designed to be MTA-safe and expected to be a standard way to
|
---|
96 | exchange Unicoded mails via mails. But with the advent of UTF-8 and
|
---|
97 | 8-bit compliant MTAs, UTF-7 is hardly ever used.
|
---|
98 |
|
---|
99 | UTF-7 was not supported by Encode until version 1.95 because of that.
|
---|
100 | But Unicode::String, a module by Gisle Aas which adds Unicode supports
|
---|
101 | to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
|
---|
102 | so Encode can supersede Unicode::String 100%.
|
---|
103 |
|
---|
104 | =head1 In Practice
|
---|
105 |
|
---|
106 | When you want to encode Unicode for mails and web pages, however, do
|
---|
107 | not use UTF-7 unless you are sure your recipients and readers can
|
---|
108 | handle it. Very few MUAs and WWW Browsers support these days (only
|
---|
109 | Mozilla seems to support one). For general cases, use UTF-8 for
|
---|
110 | message body and MIME-Header for header instead.
|
---|
111 |
|
---|
112 | =head1 SEE ALSO
|
---|
113 |
|
---|
114 | L<Encode>, L<Encode::Unicode>, L<Unicode::String>
|
---|
115 |
|
---|
116 | RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
|
---|
117 |
|
---|
118 | =cut
|
---|