source: main/trunk/greenstone2/perllib/cpan/URI/Escape.pm@ 27174

Last change on this file since 27174 was 27174, checked in by davidb, 11 years ago

Perl modules from CPAN that are used in supporting activate.pl, but not part of the Perl core. Only PMs included.

File size: 6.5 KB
Line 
1package URI::Escape;
2use strict;
3
4=head1 NAME
5
6URI::Escape - Percent-encode and percent-decode unsafe characters
7
8=head1 SYNOPSIS
9
10 use URI::Escape;
11 $safe = uri_escape("10% is enough\n");
12 $verysafe = uri_escape("foo", "\0-\377");
13 $str = uri_unescape($safe);
14
15=head1 DESCRIPTION
16
17This module provides functions to percent-encode and percent-decode URI strings as
18defined by RFC 3986. Percent-encoding URI's is informally called "URI escaping".
19This is the terminology used by this module, which predates the formalization of the
20terms by the RFC by several years.
21
22A URI consists of a restricted set of characters. The restricted set
23of characters consists of digits, letters, and a few graphic symbols
24chosen from those common to most of the character encodings and input
25facilities available to Internet users. They are made up of the
26"unreserved" and "reserved" character sets as defined in RFC 3986.
27
28 unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
29 reserved = ":" / "/" / "?" / "#" / "[" / "]" / "@"
30 "!" / "$" / "&" / "'" / "(" / ")"
31 / "*" / "+" / "," / ";" / "="
32
33In addition, any byte (octet) can be represented in a URI by an escape
34sequence: a triplet consisting of the character "%" followed by two
35hexadecimal digits. A byte can also be represented directly by a
36character, using the US-ASCII character for that octet.
37
38Some of the characters are I<reserved> for use as delimiters or as
39part of certain URI components. These must be escaped if they are to
40be treated as ordinary data. Read RFC 3986 for further details.
41
42The functions provided (and exported by default) from this module are:
43
44=over 4
45
46=item uri_escape( $string )
47
48=item uri_escape( $string, $unsafe )
49
50Replaces each unsafe character in the $string with the corresponding
51escape sequence and returns the result. The $string argument should
52be a string of bytes. The uri_escape() function will croak if given a
53characters with code above 255. Use uri_escape_utf8() if you know you
54have such chars or/and want chars in the 128 .. 255 range treated as
55UTF-8.
56
57The uri_escape() function takes an optional second argument that
58overrides the set of characters that are to be escaped. The set is
59specified as a string that can be used in a regular expression
60character class (between [ ]). E.g.:
61
62 "\x00-\x1f\x7f-\xff" # all control and hi-bit characters
63 "a-z" # all lower case characters
64 "^A-Za-z" # everything not a letter
65
66The default set of characters to be escaped is all those which are
67I<not> part of the C<unreserved> character class shown above as well
68as the reserved characters. I.e. the default is:
69
70 "^A-Za-z0-9\-\._~"
71
72=item uri_escape_utf8( $string )
73
74=item uri_escape_utf8( $string, $unsafe )
75
76Works like uri_escape(), but will encode chars as UTF-8 before
77escaping them. This makes this function able to deal with characters
78with code above 255 in $string. Note that chars in the 128 .. 255
79range will be escaped differently by this function compared to what
80uri_escape() would. For chars in the 0 .. 127 range there is no
81difference.
82
83Equivalent to:
84
85 utf8::encode($string);
86 my $uri = uri_escape($string);
87
88Note: JavaScript has a function called escape() that produces the
89sequence "%uXXXX" for chars in the 256 .. 65535 range. This function
90has really nothing to do with URI escaping but some folks got confused
91since it "does the right thing" in the 0 .. 255 range. Because of
92this you sometimes see "URIs" with these kind of escapes. The
93JavaScript encodeURIComponent() function is similar to uri_escape_utf8().
94
95=item uri_unescape($string,...)
96
97Returns a string with each %XX sequence replaced with the actual byte
98(octet).
99
100This does the same as:
101
102 $string =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
103
104but does not modify the string in-place as this RE would. Using the
105uri_unescape() function instead of the RE might make the code look
106cleaner and is a few characters less to type.
107
108In a simple benchmark test I did,
109calling the function (instead of the inline RE above) if a few chars
110were unescaped was something like 40% slower, and something like 700% slower if none were. If
111you are going to unescape a lot of times it might be a good idea to
112inline the RE.
113
114If the uri_unescape() function is passed multiple strings, then each
115one is returned unescaped.
116
117=back
118
119The module can also export the C<%escapes> hash, which contains the
120mapping from all 256 bytes to the corresponding escape codes. Lookup
121in this hash is faster than evaluating C<sprintf("%%%02X", ord($byte))>
122each time.
123
124=head1 SEE ALSO
125
126L<URI>
127
128
129=head1 COPYRIGHT
130
131Copyright 1995-2004 Gisle Aas.
132
133This program is free software; you can redistribute it and/or modify
134it under the same terms as Perl itself.
135
136=cut
137
138require Exporter;
139our @ISA = qw(Exporter);
140our %escapes;
141our @EXPORT = qw(uri_escape uri_unescape uri_escape_utf8);
142our @EXPORT_OK = qw(%escapes);
143our $VERSION = "3.31";
144
145use Carp ();
146
147# Build a char->hex map
148for (0..255) {
149 $escapes{chr($_)} = sprintf("%%%02X", $_);
150}
151
152my %subst; # compiled patterns
153
154my %Unsafe = (
155 RFC2732 => qr/[^A-Za-z0-9\-_.!~*'()]/,
156 RFC3986 => qr/[^A-Za-z0-9\-\._~]/,
157);
158
159sub uri_escape {
160 my($text, $patn) = @_;
161 return undef unless defined $text;
162 if (defined $patn){
163 unless (exists $subst{$patn}) {
164 # Because we can't compile the regex we fake it with a cached sub
165 (my $tmp = $patn) =~ s,/,\\/,g;
166 eval "\$subst{\$patn} = sub {\$_[0] =~ s/([$tmp])/\$escapes{\$1} || _fail_hi(\$1)/ge; }";
167 Carp::croak("uri_escape: $@") if $@;
168 }
169 &{$subst{$patn}}($text);
170 } else {
171 $text =~ s/($Unsafe{RFC3986})/$escapes{$1} || _fail_hi($1)/ge;
172 }
173 $text;
174}
175
176sub _fail_hi {
177 my $chr = shift;
178 Carp::croak(sprintf "Can't escape \\x{%04X}, try uri_escape_utf8() instead", ord($chr));
179}
180
181sub uri_escape_utf8 {
182 my $text = shift;
183 utf8::encode($text);
184 return uri_escape($text, @_);
185}
186
187sub uri_unescape {
188 # Note from RFC1630: "Sequences which start with a percent sign
189 # but are not followed by two hexadecimal characters are reserved
190 # for future extension"
191 my $str = shift;
192 if (@_ && wantarray) {
193 # not executed for the common case of a single argument
194 my @str = ($str, @_); # need to copy
195 for (@str) {
196 s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
197 }
198 return @str;
199 }
200 $str =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg if defined $str;
201 $str;
202}
203
204sub escape_char {
205 return join '', @URI::Escape::escapes{$_[0] =~ /(\C)/g};
206}
207
2081;
Note: See TracBrowser for help on using the repository browser.