[27174] | 1 | package URI::Escape;
|
---|
| 2 | use strict;
|
---|
| 3 |
|
---|
| 4 | =head1 NAME
|
---|
| 5 |
|
---|
| 6 | URI::Escape - Percent-encode and percent-decode unsafe characters
|
---|
| 7 |
|
---|
| 8 | =head1 SYNOPSIS
|
---|
| 9 |
|
---|
| 10 | use URI::Escape;
|
---|
| 11 | $safe = uri_escape("10% is enough\n");
|
---|
| 12 | $verysafe = uri_escape("foo", "\0-\377");
|
---|
| 13 | $str = uri_unescape($safe);
|
---|
| 14 |
|
---|
| 15 | =head1 DESCRIPTION
|
---|
| 16 |
|
---|
| 17 | This module provides functions to percent-encode and percent-decode URI strings as
|
---|
| 18 | defined by RFC 3986. Percent-encoding URI's is informally called "URI escaping".
|
---|
| 19 | This is the terminology used by this module, which predates the formalization of the
|
---|
| 20 | terms by the RFC by several years.
|
---|
| 21 |
|
---|
| 22 | A URI consists of a restricted set of characters. The restricted set
|
---|
| 23 | of characters consists of digits, letters, and a few graphic symbols
|
---|
| 24 | chosen from those common to most of the character encodings and input
|
---|
| 25 | facilities available to Internet users. They are made up of the
|
---|
| 26 | "unreserved" and "reserved" character sets as defined in RFC 3986.
|
---|
| 27 |
|
---|
| 28 | unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
|
---|
| 29 | reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@"
|
---|
| 30 | "!" / "$" / "&" / "'" / "(" / ")"
|
---|
| 31 | / "*" / "+" / "," / ";" / "="
|
---|
| 32 |
|
---|
| 33 | In addition, any byte (octet) can be represented in a URI by an escape
|
---|
| 34 | sequence: a triplet consisting of the character "%" followed by two
|
---|
| 35 | hexadecimal digits. A byte can also be represented directly by a
|
---|
| 36 | character, using the US-ASCII character for that octet.
|
---|
| 37 |
|
---|
| 38 | Some of the characters are I<reserved> for use as delimiters or as
|
---|
| 39 | part of certain URI components. These must be escaped if they are to
|
---|
| 40 | be treated as ordinary data. Read RFC 3986 for further details.
|
---|
| 41 |
|
---|
| 42 | The functions provided (and exported by default) from this module are:
|
---|
| 43 |
|
---|
| 44 | =over 4
|
---|
| 45 |
|
---|
| 46 | =item uri_escape( $string )
|
---|
| 47 |
|
---|
| 48 | =item uri_escape( $string, $unsafe )
|
---|
| 49 |
|
---|
| 50 | Replaces each unsafe character in the $string with the corresponding
|
---|
| 51 | escape sequence and returns the result. The $string argument should
|
---|
| 52 | be a string of bytes. The uri_escape() function will croak if given a
|
---|
| 53 | characters with code above 255. Use uri_escape_utf8() if you know you
|
---|
| 54 | have such chars or/and want chars in the 128 .. 255 range treated as
|
---|
| 55 | UTF-8.
|
---|
| 56 |
|
---|
| 57 | The uri_escape() function takes an optional second argument that
|
---|
| 58 | overrides the set of characters that are to be escaped. The set is
|
---|
| 59 | specified as a string that can be used in a regular expression
|
---|
| 60 | character class (between [ ]). E.g.:
|
---|
| 61 |
|
---|
| 62 | "\x00-\x1f\x7f-\xff" # all control and hi-bit characters
|
---|
| 63 | "a-z" # all lower case characters
|
---|
| 64 | "^A-Za-z" # everything not a letter
|
---|
| 65 |
|
---|
| 66 | The default set of characters to be escaped is all those which are
|
---|
| 67 | I<not> part of the C<unreserved> character class shown above as well
|
---|
| 68 | as the reserved characters. I.e. the default is:
|
---|
| 69 |
|
---|
| 70 | "^A-Za-z0-9\-\._~"
|
---|
| 71 |
|
---|
| 72 | =item uri_escape_utf8( $string )
|
---|
| 73 |
|
---|
| 74 | =item uri_escape_utf8( $string, $unsafe )
|
---|
| 75 |
|
---|
| 76 | Works like uri_escape(), but will encode chars as UTF-8 before
|
---|
| 77 | escaping them. This makes this function able to deal with characters
|
---|
| 78 | with code above 255 in $string. Note that chars in the 128 .. 255
|
---|
| 79 | range will be escaped differently by this function compared to what
|
---|
| 80 | uri_escape() would. For chars in the 0 .. 127 range there is no
|
---|
| 81 | difference.
|
---|
| 82 |
|
---|
| 83 | Equivalent to:
|
---|
| 84 |
|
---|
| 85 | utf8::encode($string);
|
---|
| 86 | my $uri = uri_escape($string);
|
---|
| 87 |
|
---|
| 88 | Note: JavaScript has a function called escape() that produces the
|
---|
| 89 | sequence "%uXXXX" for chars in the 256 .. 65535 range. This function
|
---|
| 90 | has really nothing to do with URI escaping but some folks got confused
|
---|
| 91 | since it "does the right thing" in the 0 .. 255 range. Because of
|
---|
| 92 | this you sometimes see "URIs" with these kind of escapes. The
|
---|
| 93 | JavaScript encodeURIComponent() function is similar to uri_escape_utf8().
|
---|
| 94 |
|
---|
| 95 | =item uri_unescape($string,...)
|
---|
| 96 |
|
---|
| 97 | Returns a string with each %XX sequence replaced with the actual byte
|
---|
| 98 | (octet).
|
---|
| 99 |
|
---|
| 100 | This does the same as:
|
---|
| 101 |
|
---|
| 102 | $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
|
---|
| 103 |
|
---|
| 104 | but does not modify the string in-place as this RE would. Using the
|
---|
| 105 | uri_unescape() function instead of the RE might make the code look
|
---|
| 106 | cleaner and is a few characters less to type.
|
---|
| 107 |
|
---|
| 108 | In a simple benchmark test I did,
|
---|
| 109 | calling the function (instead of the inline RE above) if a few chars
|
---|
| 110 | were unescaped was something like 40% slower, and something like 700% slower if none were. If
|
---|
| 111 | you are going to unescape a lot of times it might be a good idea to
|
---|
| 112 | inline the RE.
|
---|
| 113 |
|
---|
| 114 | If the uri_unescape() function is passed multiple strings, then each
|
---|
| 115 | one is returned unescaped.
|
---|
| 116 |
|
---|
| 117 | =back
|
---|
| 118 |
|
---|
| 119 | The module can also export the C<%escapes> hash, which contains the
|
---|
| 120 | mapping from all 256 bytes to the corresponding escape codes. Lookup
|
---|
| 121 | in this hash is faster than evaluating C<sprintf("%%%02X", ord($byte))>
|
---|
| 122 | each time.
|
---|
| 123 |
|
---|
| 124 | =head1 SEE ALSO
|
---|
| 125 |
|
---|
| 126 | L<URI>
|
---|
| 127 |
|
---|
| 128 |
|
---|
| 129 | =head1 COPYRIGHT
|
---|
| 130 |
|
---|
| 131 | Copyright 1995-2004 Gisle Aas.
|
---|
| 132 |
|
---|
| 133 | This program is free software; you can redistribute it and/or modify
|
---|
| 134 | it under the same terms as Perl itself.
|
---|
| 135 |
|
---|
| 136 | =cut
|
---|
| 137 |
|
---|
| 138 | require Exporter;
|
---|
| 139 | our @ISA = qw(Exporter);
|
---|
| 140 | our %escapes;
|
---|
| 141 | our @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8);
|
---|
| 142 | our @EXPORT_OK = qw(%escapes);
|
---|
| 143 | our $VERSION = "3.31";
|
---|
| 144 |
|
---|
| 145 | use Carp ();
|
---|
| 146 |
|
---|
| 147 | # Build a char->hex map
|
---|
| 148 | for (0..255) {
|
---|
| 149 | $escapes{chr($_)} = sprintf("%%%02X", $_);
|
---|
| 150 | }
|
---|
| 151 |
|
---|
| 152 | my %subst; # compiled patterns
|
---|
| 153 |
|
---|
| 154 | my %Unsafe = (
|
---|
| 155 | RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/,
|
---|
| 156 | RFC3986 => qr/[^A-Za-z0-9\-\._~]/,
|
---|
| 157 | );
|
---|
| 158 |
|
---|
| 159 | sub uri_escape {
|
---|
| 160 | my($text, $patn) = @_;
|
---|
| 161 | return undef unless defined $text;
|
---|
| 162 | if (defined $patn){
|
---|
| 163 | unless (exists $subst{$patn}) {
|
---|
| 164 | # Because we can't compile the regex we fake it with a cached sub
|
---|
| 165 | (my $tmp = $patn) =~ s,/,\\/,g;
|
---|
| 166 | eval "\$subst{\$patn} = sub {\$_[0] =~ s/([$tmp])/\$escapes{\$1} || _fail_hi(\$1)/ge; }";
|
---|
| 167 | Carp::croak("uri_escape: $@") if $@;
|
---|
| 168 | }
|
---|
| 169 | &{$subst{$patn}}($text);
|
---|
| 170 | } else {
|
---|
| 171 | $text =~ s/($Unsafe{RFC3986})/$escapes{$1} || _fail_hi($1)/ge;
|
---|
| 172 | }
|
---|
| 173 | $text;
|
---|
| 174 | }
|
---|
| 175 |
|
---|
| 176 | sub _fail_hi {
|
---|
| 177 | my $chr = shift;
|
---|
| 178 | Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr));
|
---|
| 179 | }
|
---|
| 180 |
|
---|
| 181 | sub uri_escape_utf8 {
|
---|
| 182 | my $text = shift;
|
---|
| 183 | utf8::encode($text);
|
---|
| 184 | return uri_escape($text, @_);
|
---|
| 185 | }
|
---|
| 186 |
|
---|
| 187 | sub uri_unescape {
|
---|
| 188 | # Note from RFC1630: "Sequences which start with a percent sign
|
---|
| 189 | # but are not followed by two hexadecimal characters are reserved
|
---|
| 190 | # for future extension"
|
---|
| 191 | my $str = shift;
|
---|
| 192 | if (@_ && wantarray) {
|
---|
| 193 | # not executed for the common case of a single argument
|
---|
| 194 | my @str = ($str, @_); # need to copy
|
---|
| 195 | for (@str) {
|
---|
| 196 | s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
|
---|
| 197 | }
|
---|
| 198 | return @str;
|
---|
| 199 | }
|
---|
| 200 | $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str;
|
---|
| 201 | $str;
|
---|
| 202 | }
|
---|
| 203 |
|
---|
[31781] | 204 | # https://rt.cpan.org/Public/Bug/Display.html?id=105109
|
---|
| 205 | # https://github.com/libwww-perl/uri/commit/7e03cdd3e5d25df5556a36dbfcd5d6cbcd676afb
|
---|
[31812] | 206 | # XXX FIXME escape_char is buggy as it assigns meaning to the string's storage format.
|
---|
[27174] | 207 | sub escape_char {
|
---|
[31781] | 208 |
|
---|
| 209 | # to fix new "\C is deprecated in regex" warning in 5.21.2 (RT#96941)
|
---|
| 210 | #return join '', @URI::Escape::escapes{$_[0] =~ /(\C)/g};
|
---|
[31812] | 211 |
|
---|
| 212 | # Old versions of utf8::is_utf8() didn't properly handle magical vars (e.g. $1).
|
---|
| 213 | # The following forces a fetch to occur beforehand.
|
---|
| 214 | my $dummy = substr($_[0], 0, 0);
|
---|
| 215 |
|
---|
[31781] | 216 | if (utf8::is_utf8($_[0])) {
|
---|
[31812] | 217 | my $s = shift;
|
---|
[31781] | 218 | utf8::encode($s);
|
---|
| 219 | unshift(@_, $s);
|
---|
| 220 | }
|
---|
[31812] | 221 |
|
---|
| 222 | return join '', @URI::Escape::escapes{split //, $_[0]};
|
---|
[27174] | 223 | }
|
---|
| 224 |
|
---|
| 225 | 1;
|
---|