source: main/trunk/greenstone2/perllib/cpan/URI.pm@ 27191

Last change on this file since 27191 was 27191, checked in by davidb, 11 years ago

This should be in the 'cpan' area

File size: 33.0 KB
Line 
package URI;

use strict;

# Package globals are declared with `our` instead of the obsolete `use vars`
# pragma.  They remain ordinary package variables, reachable from other files
# as $URI::NAME.
our $VERSION = "1.60";

# Behaviour switches described under "CONFIGURATION VARIABLES" in the POD below.
our ($ABS_REMOTE_LEADING_DOTS, $ABS_ALLOW_RELATIVE_SCHEME, $DEFAULT_QUERY_FORM_DELIMITER);

my %implements;  # mapping from scheme to implementor class

# Some "official" character classes.  $unreserved and $uric are written so
# that they can be interpolated inside a regex bracket expression, e.g.
# /[^$uric]/.

our ($reserved, $mark, $unreserved, $uric, $scheme_re);
$reserved   = q(;/?:@&=+$,[]);
$mark       = q(-_.!~*'());                                    #'; emacs
$unreserved = "A-Za-z0-9\Q$mark\E";
$uric       = quotemeta($reserved) . $unreserved . "%";

# Pattern source for a scheme name: a letter followed by letters, digits,
# ".", "+" or "-".
$scheme_re  = '[a-zA-Z][a-zA-Z0-9.+\-]*';

use Carp ();
use URI::Escape ();

# Stringification yields the stored URI string; == and != compare object
# identity (see _obj_eq), not URI equivalence.
use overload ('""'     => sub { ${$_[0]} },
              '=='     => sub { _obj_eq(@_) },
              '!='     => sub { !_obj_eq(@_) },
              fallback => 1,
             );
29
# Identity check: true only when both arguments are the very same object.
# The comparison is done on overload::StrVal() of each argument.
sub _obj_eq {
    my ($left, $right) = @_;
    return overload::StrVal($left) eq overload::StrVal($right);
}
34
# Constructor: URI->new($str) or URI->new($str, $scheme).
#
# $str is stringified (undef becomes ""), unwrapped and trimmed.  The scheme
# is taken from $str when it has one; otherwise from $scheme, which may be an
# object (its class is reused and its ->scheme is queried) or a string that
# starts with a scheme name.  The object is built by the implementor class's
# _init method; URI::_foreign is used when implementor() finds no class.
sub new
{
    my($class, $uri, $scheme) = @_;

    $uri = defined($uri) ? "$uri" : "";   # stringify

    # Get rid of potential wrapping: <...>, <URL:...>, "..." and then any
    # leading/trailing white space.
    for ($uri) {
        s/^<(?:URL:)?(.*)>$/$1/;
        s/^"(.*)"$/$1/;
        s/^\s+//;
        s/\s+$//;
    }

    my $impclass;
    if ($uri =~ m/^($scheme_re):/so) {
        # A scheme written in the string itself always wins.
        $scheme = $1;
    }
    elsif (($impclass = ref($scheme))) {
        # $scheme is an object: keep its class, ask it for the scheme name.
        $scheme = $scheme->scheme;
    }
    elsif ($scheme && $scheme =~ m/^($scheme_re)(?::|$)/o) {
        # $scheme is a bare scheme name or a string beginning with "scheme:".
        $scheme = $1;
    }

    unless ($impclass) {
        $impclass = implementor($scheme);
        unless ($impclass) {
            require URI::_foreign;
            $impclass = 'URI::_foreign';
        }
    }

    return $impclass->_init($uri, $scheme);
}
66
67
# Constructor: $class->new_abs($uri, $base).
# Builds an object with $class->new($uri, $base) and returns whatever its
# abs($base) method returns.
sub new_abs
{
    my($class, $uri, $base) = @_;
    return $class->new($uri, $base)->abs($base);
}
74
75
# Low-level constructor: $class->_init($str, $scheme).
# Escapes $str via _uric_escape, prepends "$scheme:" unless the string already
# starts with a scheme or the class accepts scheme-less URIs (_no_scheme_ok),
# and returns a reference to that string blessed into $class.
sub _init
{
    my($class, $str, $scheme) = @_;

    # find all funny characters and encode the bytes.
    $str = $class->_uric_escape($str);

    unless ($str =~ /^$scheme_re:/o || $class->_no_scheme_ok) {
        $str = "$scheme:$str";
    }

    return bless \$str, $class;
}
87
88
# $class->_uric_escape($str): returns a copy of $str in which every character
# that is neither in $uric nor a "#" has been replaced by the result of
# URI::Escape::escape_char; utf8::downgrade is then applied to the result.
sub _uric_escape
{
    my($class, $str) = @_;
    $str =~ s{([^$uric\#])}{ URI::Escape::escape_char($1) }ego;
    utf8::downgrade($str);
    return $str;
}
96
97
# Look up, or register, the class that implements a URI scheme.  This is a
# plain function, not a method.
#
#   implementor($scheme)
#       Returns the name of the implementor class, loading "URI::<scheme>" on
#       demand.  Returns nothing when no such class can be found.
#   implementor($scheme, $impclass)
#       Registers $impclass for $scheme and returns the class that was
#       registered before (undef if none).
#
# A false or syntactically invalid $scheme yields "URI::_generic".
# Dies if loading the implementor module fails for any reason other than the
# module not being found in @INC.
sub implementor
{
    my($scheme, $impclass) = @_;
    if (!$scheme || $scheme !~ /\A$scheme_re\z/o) {
        require URI::_generic;
        return "URI::_generic";
    }

    $scheme = lc($scheme);

    if ($impclass) {
        # Set the implementor class for a given scheme
        my $old = $implements{$scheme};
        $impclass->_init_implementor($scheme);
        $implements{$scheme} = $impclass;
        return $old;
    }

    my $ic = $implements{$scheme};
    return $ic if $ic;

    # scheme not yet known, look for internal or
    # preloaded (with 'use') implementation
    $ic = "URI::$scheme";  # default location

    # turn scheme into a valid perl identifier by a simple transformation...
    $ic =~ s/\+/_P/g;
    $ic =~ s/\./_O/g;
    $ic =~ s/\-/_/g;

    no strict 'refs';
    # check we actually have one for the scheme:
    unless (@{"${ic}::ISA"}) {
        # Try to load it.  The probe is expected to fail for unknown schemes,
        # so keep it from clobbering an error the caller still has in $@.
        # $ic is safe to interpolate: $scheme was validated against
        # $scheme_re above and its ".", "+" and "-" have been rewritten.
        my $saved_error = $@;
        eval "require $ic";
        die $@ if $@ && $@ !~ /Can\'t locate.*in \@INC/;
        $@ = $saved_error;
        return unless @{"${ic}::ISA"};
    }

    $ic->_init_implementor($scheme);
    $implements{$scheme} = $ic;
    return $ic;
}
141
142
# Hook invoked by implementor() as $class->_init_implementor($scheme) whenever
# a scheme is bound to an implementor class.  This base version does nothing
# beyond unpacking its arguments.
sub _init_implementor
{
    # Remember that one implementor class may actually
    # serve to implement several URI schemes.
    my($class, $scheme) = @_;
}
149
150
# $uri->clone: returns a new object of the same class that holds its own copy
# of the URI string.
sub clone
{
    my($self) = @_;
    my $copy = ${$self};
    return bless \$copy, ref($self);
}
157
158
# Class predicate consulted by _init and _scheme: may a URI of this class
# exist without a scheme?  The base class answers no (0).
sub _no_scheme_ok
{
    return 0;
}
160
# Low-level scheme accessor; scheme() wraps it and lowercases the result.
#
#   $uri->_scheme
#       Returns the scheme exactly as written in the URI, or nothing if the
#       URI string does not start with a scheme.
#   $uri->_scheme($new)
#       Replaces the scheme and returns the old one.  The object is rebuilt
#       through URI->new and reblessed, so its class may change.  Croaks with
#       "Bad scheme" unless $new matches $scheme_re in full.
#   $uri->_scheme(undef)   (or the empty string)
#       Strips the scheme when the class allows scheme-less URIs
#       (_no_scheme_ok); otherwise the URI is left untouched.  The old scheme
#       is returned either way.
sub _scheme
{
    my $self = shift;

    unless (@_) {
        return unless $$self =~ /^($scheme_re):/o;
        return $1;
    }

    my $old;
    my $new = shift;
    if (defined($new) && length($new)) {
        # Anchor with \A...\z rather than ^...$: "$" also matches just before
        # a trailing newline, which let values such as "http\n" slip through
        # and end up embedded in the URI instead of being rejected.
        Carp::croak("Bad scheme '$new'") unless $new =~ /\A$scheme_re\z/o;
        $old = $1 if $$self =~ s/^($scheme_re)://o;
        my $newself = URI->new("$new:$$self");
        $$self = $$newself;
        bless $self, ref($newself);
    }
    else {
        if ($self->_no_scheme_ok) {
            $old = $1 if $$self =~ s/^($scheme_re)://o;
            # After stripping, what is left may itself start with "word:".
            Carp::carp("Oops, opaque part now look like scheme")
                if $^W && $$self =~ m/^$scheme_re:/o;
        }
        else {
            # Scheme is mandatory for this class: report it, do not remove it.
            $old = $1 if $$self =~ m/^($scheme_re):/o;
        }
    }

    return $old;
}
192
# $uri->scheme / $uri->scheme($new): forwards to _scheme and returns its
# result lowercased, or nothing when _scheme returns undef.
sub scheme
{
    my $self = shift;
    my $scheme = $self->_scheme(@_);
    return defined($scheme) ? lc($scheme) : ();
}
199
200
# $uri->opaque / $uri->opaque($new_opaque)
#
# Getter: returns everything after the optional "scheme:" up to, but not
# including, the first "#".
# Setter: replaces that part, keeping the scheme and fragment, and returns the
# old value.  An undefined new value is treated as "".  Characters outside
# $uric are passed through URI::Escape::escape_char first.
sub opaque
{
    my $self = shift;

    if (!@_) {
        $$self =~ /^(?:$scheme_re:)?([^\#]*)/o or die;
        return $1;
    }

    my($old_scheme, $old_opaque, $old_frag) =
        $$self =~ /^($scheme_re:)?    # optional scheme
                    ([^\#]*)          # opaque
                    (\#.*)?           # optional fragment
                  $/sx
        or die;

    my $new_opaque = shift;
    $new_opaque = "" unless defined $new_opaque;
    $new_opaque =~ s/([^$uric])/ URI::Escape::escape_char($1)/ego;
    utf8::downgrade($new_opaque);

    # Reassemble: scheme (if any) + new opaque part + fragment (if any).
    $$self = join "",
        (defined($old_scheme) ? $old_scheme : ""),
        $new_opaque,
        (defined($old_frag) ? $old_frag : "");

    return $old_opaque;
}

*path = \&opaque;  # alias
232
233
# $uri->fragment / $uri->fragment($new_frag)
#
# Getter: returns the text after the first "#", or nothing if there is none.
# Setter: removes any existing fragment, appends "#$new_frag" when $new_frag
# is defined (after escaping characters outside $uric), and returns the old
# fragment (undef if there was none).
sub fragment
{
    my $self = shift;

    if (!@_) {
        return $$self =~ /\#(.*)/s ? $1 : ();
    }

    my $new_frag = shift;
    my $old = $$self =~ s/\#(.*)//s ? $1 : undef;

    if (defined $new_frag) {
        $new_frag =~ s/([^$uric])/ URI::Escape::escape_char($1) /ego;
        utf8::downgrade($new_frag);
        $$self .= "#$new_frag";
    }

    return $old;
}
253
254
# $uri->as_string: returns the URI string stored in the object.
sub as_string
{
    my($self) = @_;
    return ${$self};
}
260
261
# $uri->as_iri: returns the URI as a character string in which escaped octets
# >= 0x80 that form valid UTF-8 are replaced by the characters they encode.
# Octets that do not decode are escaped again.  The object is not modified.
sub as_iri
{
    my $self = shift;
    my $str = $$self;

    # Unescape every %XX with the high bit set; if there was none we are done.
    return $str
        unless $str =~ s/%([89a-fA-F][0-9a-fA-F])/chr(hex($1))/eg;

    # All this crap because the more obvious:
    #
    #   Encode::decode("UTF-8", $str, sub { sprintf "%%%02X", shift })
    #
    # doesn't work before Encode 2.39.  Wait for a standard release
    # to bundle that version.

    require Encode;
    my $enc = Encode::find_encoding("UTF-8");
    my $iri = "";
    while (length $str) {
        # Decode as far as possible; the consumed part is removed from $str.
        $iri .= $enc->decode($str, Encode::FB_QUIET());
        next unless length $str;
        # Decoding stopped early: escape the next char and carry on after it.
        $iri .= URI::Escape::escape_char(substr($str, 0, 1, ""));
    }
    return $iri;
}
288
289
# $uri->canonical: returns a normalized URI.
#
# Make sure scheme is lowercased, that we don't escape unreserved chars,
# and that we use upcase escape sequences.
#
# If the scheme has no upper-case letter and the string holds no %XX escape,
# $self itself is returned; otherwise a modified clone is returned.
sub canonical
{
    my $self = shift;

    my $scheme    = $self->_scheme || "";
    my $uc_scheme = $scheme =~ /[A-Z]/;
    my $esc       = $$self =~ /%[a-fA-F0-9]{2}/;
    return $self unless $uc_scheme || $esc;

    my $other = $self->clone;
    $other->_scheme(lc $scheme) if $uc_scheme;

    if ($esc) {
        $$other =~ s{%([0-9a-fA-F]{2})}
                    { my $hex = $1;
                      my $chr = chr(hex($hex));
                      $chr =~ /^[$unreserved]\z/o ? $chr : "%\U$hex"
                    }ge;
    }
    return $other;
}
313
# Compare two URIs, subclasses will provide a more correct implementation.
# Usable as a method or as a plain function; an argument that is not a
# reference is first turned into an object with URI->new, using the other
# argument as the scheme hint.  The result is true when both objects have the
# same class and their canonical strings are equal.
sub eq {
    my($self, $other) = @_;
    $self  = URI->new($self, $other) unless ref $self;
    $other = URI->new($other, $self) unless ref $other;

    my $same_class = ref($self) eq ref($other);
    return $same_class unless $same_class;
    return $self->canonical->as_string eq $other->canonical->as_string;
}
322
# generic-URI transformation methods.  In this base class both simply hand
# back the invocant; any base URI argument is ignored.
sub abs
{
    my $self = shift;
    return $self;
}

sub rel
{
    my $self = shift;
    return $self;
}

# Base-class answer to "is this a secure-channel URI?": always 0.
sub secure
{
    return 0;
}
328
# help out Storable: the frozen form of a URI object is just its string.
# $cloning is accepted but not used.
sub STORABLE_freeze {
    my $self = shift;
    return ${$self};
}
334
# Counterpart of STORABLE_freeze: stores the frozen string $str back into the
# object.  $cloning is accepted but not used.
sub STORABLE_thaw {
    my($self, $cloning, $str) = @_;
    return ${$self} = $str;
}

1;
341
342__END__
343
344=head1 NAME
345
346URI - Uniform Resource Identifiers (absolute and relative)
347
348=head1 SYNOPSIS
349
350 $u1 = URI->new("http://www.perl.com");
351 $u2 = URI->new("foo", "http");
352 $u3 = $u2->abs($u1);
353 $u4 = $u3->clone;
354 $u5 = URI->new("HTTP://WWW.perl.com:80")->canonical;
355
356 $str = $u->as_string;
357 $str = "$u";
358
359 $scheme = $u->scheme;
360 $opaque = $u->opaque;
361 $path = $u->path;
362 $frag = $u->fragment;
363
364 $u->scheme("ftp");
365 $u->host("ftp.perl.com");
366 $u->path("cpan/");
367
368=head1 DESCRIPTION
369
370This module implements the C<URI> class. Objects of this class
371represent "Uniform Resource Identifier references" as specified in RFC
3722396 (and updated by RFC 2732).
373
374A Uniform Resource Identifier is a compact string of characters that
375identifies an abstract or physical resource. A Uniform Resource
376Identifier can be further classified as either a Uniform Resource Locator
377(URL) or a Uniform Resource Name (URN). The distinction between URL
378and URN does not matter to the C<URI> class interface. A
379"URI-reference" is a URI that may have additional information attached
380in the form of a fragment identifier.
381
382An absolute URI reference consists of three parts: a I<scheme>, a
383I<scheme-specific part> and a I<fragment> identifier. A subset of URI
384references share a common syntax for hierarchical namespaces. For
385these, the scheme-specific part is further broken down into
386I<authority>, I<path> and I<query> components. These URIs can also
387take the form of relative URI references, where the scheme (and
388usually also the authority) component is missing, but implied by the
389context of the URI reference. The three forms of URI reference
390syntax are summarized as follows:
391
392 <scheme>:<scheme-specific-part>#<fragment>
393 <scheme>://<authority><path>?<query>#<fragment>
394 <path>?<query>#<fragment>
395
396The components into which a URI reference can be divided depend on the
397I<scheme>. The C<URI> class provides methods to get and set the
398individual components. The methods available for a specific
399C<URI> object depend on the scheme.
400
401=head1 CONSTRUCTORS
402
403The following methods construct new C<URI> objects:
404
405=over 4
406
407=item $uri = URI->new( $str )
408
409=item $uri = URI->new( $str, $scheme )
410
411Constructs a new URI object. The string
412representation of a URI is given as argument, together with an optional
413scheme specification. Common URI wrappers like "" and <>, as well as
414leading and trailing white space, are automatically removed from
415the $str argument before it is processed further.
416
417The constructor determines the scheme, maps this to an appropriate
418URI subclass, constructs a new object of that class and returns it.
419
420The $scheme argument is only used when $str is a
421relative URI. It can be either a simple string that
422denotes the scheme, a string containing an absolute URI reference, or
423an absolute C<URI> object. If no $scheme is specified for a relative
424URI $str, then $str is simply treated as a generic URI (no scheme-specific
425methods available).
426
427The set of characters available for building URI references is
428restricted (see L<URI::Escape>). Characters outside this set are
429automatically escaped by the URI constructor.
430
431=item $uri = URI->new_abs( $str, $base_uri )
432
433Constructs a new absolute URI object. The $str argument can
434denote a relative or absolute URI. If relative, then it is
435absolutized using $base_uri as base. The $base_uri must be an absolute
436URI.
437
438=item $uri = URI::file->new( $filename )
439
440=item $uri = URI::file->new( $filename, $os )
441
442Constructs a new I<file> URI from a file name. See L<URI::file>.
443
444=item $uri = URI::file->new_abs( $filename )
445
446=item $uri = URI::file->new_abs( $filename, $os )
447
448Constructs a new absolute I<file> URI from a file name. See
449L<URI::file>.
450
451=item $uri = URI::file->cwd
452
453Returns the current working directory as a I<file> URI. See
454L<URI::file>.
455
456=item $uri->clone
457
458Returns a copy of the $uri.
459
460=back
461
462=head1 COMMON METHODS
463
464The methods described in this section are available for all C<URI>
465objects.
466
467Methods that give access to components of a URI always return the
468old value of the component. The value returned is C<undef> if the
469component was not present. There is generally a difference between a
470component that is empty (represented as C<"">) and a component that is
471missing (represented as C<undef>). If an accessor method is given an
472argument, it updates the corresponding component in addition to
473returning the old value of the component. Passing an undefined
474argument removes the component (if possible). The description of
475each accessor method indicates whether the component is passed as
an escaped (percent-encoded) or an unescaped string.  Components that can be further
divided into sub-parts are usually passed escaped, as unescaping might
change their semantics.
479
480The common methods available for all URI are:
481
482=over 4
483
484=item $uri->scheme
485
486=item $uri->scheme( $new_scheme )
487
488Sets and returns the scheme part of the $uri. If the $uri is
489relative, then $uri->scheme returns C<undef>. If called with an
490argument, it updates the scheme of $uri, possibly changing the
491class of $uri, and returns the old scheme value. The method croaks
492if the new scheme name is illegal; a scheme name must begin with a
493letter and must consist of only US-ASCII letters, numbers, and a few
494special marks: ".", "+", "-". This restriction effectively means
495that the scheme must be passed unescaped. Passing an undefined
496argument to the scheme method makes the URI relative (if possible).
497
498Letter case does not matter for scheme names. The string
499returned by $uri->scheme is always lowercase. If you want the scheme
500just as it was written in the URI in its original case,
501you can use the $uri->_scheme method instead.
502
503=item $uri->opaque
504
505=item $uri->opaque( $new_opaque )
506
507Sets and returns the scheme-specific part of the $uri
508(everything between the scheme and the fragment)
509as an escaped string.
510
511=item $uri->path
512
513=item $uri->path( $new_path )
514
515Sets and returns the same value as $uri->opaque unless the URI
516supports the generic syntax for hierarchical namespaces.
517In that case the generic method is overridden to set and return
518the part of the URI between the I<host name> and the I<fragment>.
519
520=item $uri->fragment
521
522=item $uri->fragment( $new_frag )
523
524Returns the fragment identifier of a URI reference
525as an escaped string.
526
527=item $uri->as_string
528
Returns the URI object as a plain ASCII string.  URI objects are
530also converted to plain strings automatically by overloading. This
531means that $uri objects can be used as plain strings in most Perl
532constructs.
533
534=item $uri->as_iri
535
536Returns a Unicode string representing the URI. Escaped UTF-8 sequences
537representing non-ASCII characters are turned into their corresponding Unicode
538code point.
539
540=item $uri->canonical
541
542Returns a normalized version of the URI. The rules
543for normalization are scheme-dependent. They usually involve
544lowercasing the scheme and Internet host name components,
545removing the explicit port specification if it matches the default port,
546uppercasing all escape sequences, and unescaping octets that can be
547better represented as plain characters.
548
549For efficiency reasons, if the $uri is already in normalized form,
550then a reference to it is returned instead of a copy.
551
552=item $uri->eq( $other_uri )
553
554=item URI::eq( $first_uri, $other_uri )
555
556Tests whether two URI references are equal. URI references
557that normalize to the same string are considered equal. The method
558can also be used as a plain function which can also test two string
559arguments.
560
561If you need to test whether two C<URI> object references denote the
562same object, use the '==' operator.
563
564=item $uri->abs( $base_uri )
565
566Returns an absolute URI reference. If $uri is already
567absolute, then a reference to it is simply returned. If the $uri
568is relative, then a new absolute URI is constructed by combining the
569$uri and the $base_uri, and returned.
570
571=item $uri->rel( $base_uri )
572
573Returns a relative URI reference if it is possible to
574make one that denotes the same resource relative to $base_uri.
575If not, then $uri is simply returned.
576
577=item $uri->secure
578
579Returns a TRUE value if the URI is considered to point to a resource on
580a secure channel, such as an SSL or TLS encrypted one.
581
582=back
583
584=head1 GENERIC METHODS
585
586The following methods are available to schemes that use the
587common/generic syntax for hierarchical namespaces. The descriptions of
588schemes below indicate which these are. Unknown schemes are
589assumed to support the generic syntax, and therefore the following
590methods:
591
592=over 4
593
594=item $uri->authority
595
596=item $uri->authority( $new_authority )
597
598Sets and returns the escaped authority component
599of the $uri.
600
601=item $uri->path
602
603=item $uri->path( $new_path )
604
605Sets and returns the escaped path component of
606the $uri (the part between the host name and the query or fragment).
607The path can never be undefined, but it can be the empty string.
608
609=item $uri->path_query
610
611=item $uri->path_query( $new_path_query )
612
613Sets and returns the escaped path and query
614components as a single entity. The path and the query are
615separated by a "?" character, but the query can itself contain "?".
616
617=item $uri->path_segments
618
619=item $uri->path_segments( $segment, ... )
620
621Sets and returns the path. In a scalar context, it returns
622the same value as $uri->path. In a list context, it returns the
623unescaped path segments that make up the path. Path segments that
624have parameters are returned as an anonymous array. The first element
625is the unescaped path segment proper; subsequent elements are escaped
626parameter strings. Such an anonymous array uses overloading so it can
627be treated as a string too, but this string does not include the
628parameters.
629
630Note that absolute paths have the empty string as their first
I<path_segment>, i.e. the I<path> C</foo/bar> has 3
I<path_segments>: "", "foo" and "bar".
633
634=item $uri->query
635
636=item $uri->query( $new_query )
637
638Sets and returns the escaped query component of
639the $uri.
640
641=item $uri->query_form
642
643=item $uri->query_form( $key1 => $val1, $key2 => $val2, ... )
644
645=item $uri->query_form( $key1 => $val1, $key2 => $val2, ..., $delim )
646
647=item $uri->query_form( \@key_value_pairs )
648
649=item $uri->query_form( \@key_value_pairs, $delim )
650
651=item $uri->query_form( \%hash )
652
653=item $uri->query_form( \%hash, $delim )
654
655Sets and returns query components that use the
656I<application/x-www-form-urlencoded> format. Key/value pairs are
657separated by "&", and the key is separated from the value by a "="
658character.
659
660The form can be set either by passing separate key/value pairs, or via
661an array or hash reference. Passing an empty array or an empty hash
662removes the query component, whereas passing no arguments at all leaves
663the component unchanged. The order of keys is undefined if a hash
664reference is passed. The old value is always returned as a list of
665separate key/value pairs. Assigning this list to a hash is unwise as
666the keys returned might repeat.
667
668The values passed when setting the form can be plain strings or
669references to arrays of strings. Passing an array of values has the
670same effect as passing the key repeatedly with one value at a time.
671All the following statements have the same effect:
672
673 $uri->query_form(foo => 1, foo => 2);
674 $uri->query_form(foo => [1, 2]);
675 $uri->query_form([ foo => 1, foo => 2 ]);
676 $uri->query_form([ foo => [1, 2] ]);
677 $uri->query_form({ foo => [1, 2] });
678
679The $delim parameter can be passed as ";" to force the key/value pairs
680to be delimited by ";" instead of "&" in the query string. This
681practice is often recommended for URLs embedded in HTML or XML
682documents as this avoids the trouble of escaping the "&" character.
683You might also set the $URI::DEFAULT_QUERY_FORM_DELIMITER variable to
684";" for the same global effect.
685
686The C<URI::QueryParam> module can be loaded to add further methods to
687manipulate the form of a URI. See L<URI::QueryParam> for details.
688
689=item $uri->query_keywords
690
691=item $uri->query_keywords( $keywords, ... )
692
693=item $uri->query_keywords( \@keywords )
694
695Sets and returns query components that use the
696keywords separated by "+" format.
697
698The keywords can be set either by passing separate keywords directly
699or by passing a reference to an array of keywords. Passing an empty
700array removes the query component, whereas passing no arguments at
701all leaves the component unchanged. The old value is always returned
702as a list of separate words.
703
704=back
705
706=head1 SERVER METHODS
707
708For schemes where the I<authority> component denotes an Internet host,
709the following methods are available in addition to the generic
710methods.
711
712=over 4
713
714=item $uri->userinfo
715
716=item $uri->userinfo( $new_userinfo )
717
718Sets and returns the escaped userinfo part of the
719authority component.
720
721For some schemes this is a user name and a password separated by
722a colon. This practice is not recommended. Embedding passwords in
723clear text (such as URI) has proven to be a security risk in almost
724every case where it has been used.
725
726=item $uri->host
727
728=item $uri->host( $new_host )
729
730Sets and returns the unescaped hostname.
731
732If the $new_host string ends with a colon and a number, then this
733number also sets the port.
734
For IPv6 addresses the brackets around the raw address are removed in the return
736value from $uri->host. When setting the host attribute to an IPv6 address you
737can use a raw address or one enclosed in brackets. The address needs to be
738enclosed in brackets if you want to pass in a new port value as well.
739
740=item $uri->ihost
741
742Returns the host in Unicode form. Any IDNA A-labels are turned into U-labels.
743
744=item $uri->port
745
746=item $uri->port( $new_port )
747
748Sets and returns the port. The port is a simple integer
749that should be greater than 0.
750
751If a port is not specified explicitly in the URI, then the URI scheme's default port
752is returned. If you don't want the default port
753substituted, then you can use the $uri->_port method instead.
754
755=item $uri->host_port
756
757=item $uri->host_port( $new_host_port )
758
759Sets and returns the host and port as a single
760unit. The returned value includes a port, even if it matches the
761default port. The host part and the port part are separated by a
762colon: ":".
763
764For IPv6 addresses the bracketing is preserved; thus
765URI->new("http://[::1]/")->host_port returns "[::1]:80". Contrast this with
766$uri->host which will remove the brackets.
767
768=item $uri->default_port
769
770Returns the default port of the URI scheme to which $uri
771belongs. For I<http> this is the number 80, for I<ftp> this
772is the number 21, etc. The default port for a scheme can not be
773changed.
774
775=back
776
777=head1 SCHEME-SPECIFIC SUPPORT
778
779Scheme-specific support is provided for the following URI schemes. For C<URI>
780objects that do not belong to one of these, you can only use the common and
781generic methods.
782
783=over 4
784
785=item B<data>:
786
787The I<data> URI scheme is specified in RFC 2397. It allows inclusion
788of small data items as "immediate" data, as if it had been included
789externally.
790
791C<URI> objects belonging to the data scheme support the common methods
792and two new methods to access their scheme-specific components:
793$uri->media_type and $uri->data. See L<URI::data> for details.
794
795=item B<file>:
796
797An old specification of the I<file> URI scheme is found in RFC 1738.
A new RFC 2396 based specification is not available yet, but file URI
799references are in common use.
800
801C<URI> objects belonging to the file scheme support the common and
802generic methods. In addition, they provide two methods for mapping file URIs
803back to local file names; $uri->file and $uri->dir. See L<URI::file>
804for details.
805
806=item B<ftp>:
807
808An old specification of the I<ftp> URI scheme is found in RFC 1738. A
new RFC 2396 based specification is not available yet, but ftp URI
810references are in common use.
811
812C<URI> objects belonging to the ftp scheme support the common,
813generic and server methods. In addition, they provide two methods for
814accessing the userinfo sub-components: $uri->user and $uri->password.
815
816=item B<gopher>:
817
818The I<gopher> URI scheme is specified in
819<draft-murali-url-gopher-1996-12-04> and will hopefully be available
820as a RFC 2396 based specification.
821
822C<URI> objects belonging to the gopher scheme support the common,
823generic and server methods. In addition, they support some methods for
824accessing gopher-specific path components: $uri->gopher_type,
825$uri->selector, $uri->search, $uri->string.
826
827=item B<http>:
828
829The I<http> URI scheme is specified in RFC 2616.
830The scheme is used to reference resources hosted by HTTP servers.
831
832C<URI> objects belonging to the http scheme support the common,
833generic and server methods.
834
835=item B<https>:
836
837The I<https> URI scheme is a Netscape invention which is commonly
838implemented. The scheme is used to reference HTTP servers through SSL
839connections. Its syntax is the same as http, but the default
840port is different.
841
842=item B<ldap>:
843
844The I<ldap> URI scheme is specified in RFC 2255. LDAP is the
845Lightweight Directory Access Protocol. An ldap URI describes an LDAP
846search operation to perform to retrieve information from an LDAP
847directory.
848
849C<URI> objects belonging to the ldap scheme support the common,
850generic and server methods as well as ldap-specific methods: $uri->dn,
851$uri->attributes, $uri->scope, $uri->filter, $uri->extensions. See
852L<URI::ldap> for details.
853
854=item B<ldapi>:
855
856Like the I<ldap> URI scheme, but uses a UNIX domain socket. The
857server methods are not supported, and the local socket path is
858available as $uri->un_path. The I<ldapi> scheme is used by the
859OpenLDAP package. There is no real specification for it, but it is
860mentioned in various OpenLDAP manual pages.
861
862=item B<ldaps>:
863
864Like the I<ldap> URI scheme, but uses an SSL connection. This
865scheme is deprecated, as the preferred way is to use the I<start_tls>
866mechanism.
867
868=item B<mailto>:
869
870The I<mailto> URI scheme is specified in RFC 2368. The scheme was
871originally used to designate the Internet mailing address of an
872individual or service. It has (in RFC 2368) been extended to allow
873setting of other mail header fields and the message body.
874
875C<URI> objects belonging to the mailto scheme support the common
876methods and the generic query methods. In addition, they support the
877following mailto-specific methods: $uri->to, $uri->headers.
878
Note that the "foo@example.com" part of a mailto is I<not> the
880C<userinfo> and C<host> but instead the C<path>. This allows a
881mailto URI to contain multiple comma separated email addresses.
882
883=item B<mms>:
884
885The I<mms> URL specification can be found at L<http://sdp.ppona.com/>.
886C<URI> objects belonging to the mms scheme support the common,
887generic, and server methods, with the exception of userinfo and
888query-related sub-components.
889
890=item B<news>:
891
892The I<news>, I<nntp> and I<snews> URI schemes are specified in
893<draft-gilman-news-url-01> and will hopefully be available as an RFC
8942396 based specification soon.
895
896C<URI> objects belonging to the news scheme support the common,
897generic and server methods. In addition, they provide some methods to
898access the path: $uri->group and $uri->message.
899
900=item B<nntp>:
901
902See I<news> scheme.
903
904=item B<pop>:
905
906The I<pop> URI scheme is specified in RFC 2384. The scheme is used to
907reference a POP3 mailbox.
908
909C<URI> objects belonging to the pop scheme support the common, generic
910and server methods. In addition, they provide two methods to access the
userinfo components: $uri->user and $uri->auth.
912
913=item B<rlogin>:
914
915An old specification of the I<rlogin> URI scheme is found in RFC
9161738. C<URI> objects belonging to the rlogin scheme support the
917common, generic and server methods.
918
919=item B<rtsp>:
920
921The I<rtsp> URL specification can be found in section 3.2 of RFC 2326.
922C<URI> objects belonging to the rtsp scheme support the common,
923generic, and server methods, with the exception of userinfo and
924query-related sub-components.
925
926=item B<rtspu>:
927
928The I<rtspu> URI scheme is used to talk to RTSP servers over UDP
929instead of TCP. The syntax is the same as rtsp.
930
931=item B<rsync>:
932
933Information about rsync is available from L<http://rsync.samba.org/>.
934C<URI> objects belonging to the rsync scheme support the common,
935generic and server methods. In addition, they provide methods to
936access the userinfo sub-components: $uri->user and $uri->password.
937
938=item B<sip>:
939
940The I<sip> URI specification is described in sections 19.1 and 25
941of RFC 3261. C<URI> objects belonging to the sip scheme support the
942common, generic, and server methods with the exception of path related
943sub-components. In addition, they provide two methods to get and set
944I<sip> parameters: $uri->params_form and $uri->params.
945
946=item B<sips>:
947
948See I<sip> scheme. Its syntax is the same as sip, but the default
949port is different.
950
951=item B<snews>:
952
953See I<news> scheme. Its syntax is the same as news, but the default
954port is different.
955
956=item B<telnet>:
957
958An old specification of the I<telnet> URI scheme is found in RFC
9591738. C<URI> objects belonging to the telnet scheme support the
960common, generic and server methods.
961
962=item B<tn3270>:
963
964These URIs are used like I<telnet> URIs but for connections to IBM
965mainframes. C<URI> objects belonging to the tn3270 scheme support the
966common, generic and server methods.
967
968=item B<ssh>:
969
970Information about ssh is available at L<http://www.openssh.com/>.
971C<URI> objects belonging to the ssh scheme support the common,
972generic and server methods. In addition, they provide methods to
973access the userinfo sub-components: $uri->user and $uri->password.
974
975=item B<urn>:
976
977The syntax of Uniform Resource Names is specified in RFC 2141. C<URI>
978objects belonging to the urn scheme provide the common methods, and also the
979methods $uri->nid and $uri->nss, which return the Namespace Identifier
980and the Namespace-Specific String respectively.
981
982The Namespace Identifier basically works like the Scheme identifier of
983URIs, and further divides the URN namespace. Namespace Identifier
984assignments are maintained at
985L<http://www.iana.org/assignments/urn-namespaces>.
986
987Letter case is not significant for the Namespace Identifier. It is
988always returned in lower case by the $uri->nid method. The $uri->_nid
989method can be used if you want it in its original case.
990
991=item B<urn>:B<isbn>:
992
993The C<urn:isbn:> namespace contains International Standard Book
994Numbers (ISBNs) and is described in RFC 3187. A C<URI> object belonging
995to this namespace has the following extra methods (if the
996Business::ISBN module is available): $uri->isbn,
997$uri->isbn_publisher_code, $uri->isbn_group_code (formerly isbn_country_code,
which is still supported but issues a deprecation warning), $uri->isbn_as_ean.
999
1000=item B<urn>:B<oid>:
1001
1002The C<urn:oid:> namespace contains Object Identifiers (OIDs) and is
1003described in RFC 3061. An object identifier consists of sequences of digits
1004separated by dots. A C<URI> object belonging to this namespace has an
1005additional method called $uri->oid that can be used to get/set the oid
1006value. In a list context, oid numbers are returned as separate elements.
1007
1008=back
1009
1010=head1 CONFIGURATION VARIABLES
1011
1012The following configuration variables influence how the class and its
1013methods behave:
1014
1015=over 4
1016
1017=item $URI::ABS_ALLOW_RELATIVE_SCHEME
1018
1019Some older parsers used to allow the scheme name to be present in the
1020relative URL if it was the same as the base URL scheme. RFC 2396 says
1021that this should be avoided, but you can enable this old behaviour by
1022setting the $URI::ABS_ALLOW_RELATIVE_SCHEME variable to a TRUE value.
1023The difference is demonstrated by the following examples:
1024
1025 URI->new("http:foo")->abs("http://host/a/b")
1026 ==> "http:foo"
1027
1028 local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1;
1029 URI->new("http:foo")->abs("http://host/a/b")
1030 ==> "http://host/a/foo"
1031
1032
1033=item $URI::ABS_REMOTE_LEADING_DOTS
1034
1035You can also have the abs() method ignore excess ".."
1036segments in the relative URI by setting $URI::ABS_REMOTE_LEADING_DOTS
1037to a TRUE value. The difference is demonstrated by the following
1038examples:
1039
1040 URI->new("../../../foo")->abs("http://host/a/b")
1041 ==> "http://host/../../foo"
1042
1043 local $URI::ABS_REMOTE_LEADING_DOTS = 1;
1044 URI->new("../../../foo")->abs("http://host/a/b")
1045 ==> "http://host/foo"
1046
1047=item $URI::DEFAULT_QUERY_FORM_DELIMITER
1048
1049This value can be set to ";" to have the query form C<key=value> pairs
1050delimited by ";" instead of "&" which is the default.
1051
1052=back
1053
1054=head1 BUGS
1055
1056There are some things that are not quite right:
1057
1058=over
1059
1060=item *
1061
1062Using regexp variables like $1 directly as arguments to the URI accessor methods
1063does not work too well with current perl implementations. I would argue
1064that this is actually a bug in perl. The workaround is to quote
1065them. Example:
1066
1067 /(...)/ || die;
1068 $u->query("$1");
1069
1070
1071=item *
1072
1073The escaping (percent encoding) of chars in the 128 .. 255 range passed to the
1074URI constructor or when setting URI parts using the accessor methods depends on
1075the state of the internal UTF8 flag (see utf8::is_utf8) of the string passed.
1076If the UTF8 flag is set the UTF-8 encoded version of the character is percent
1077encoded. If the UTF8 flag isn't set the Latin-1 version (byte) of the
1078character is percent encoded. This basically exposes the internal encoding of
1079Perl strings.
1080
1081=back
1082
1083=head1 PARSING URIs WITH REGEXP
1084
1085As an alternative to this module, the following (official) regular
1086expression can be used to decode a URI:
1087
1088 my($scheme, $authority, $path, $query, $fragment) =
1089 $uri =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
1090
1091The C<URI::Split> module provides the function uri_split() as a
1092readable alternative.
1093
1094=head1 SEE ALSO
1095
1096L<URI::file>, L<URI::WithBase>, L<URI::QueryParam>, L<URI::Escape>,
1097L<URI::Split>, L<URI::Heuristic>
1098
1099RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax",
1100Berners-Lee, Fielding, Masinter, August 1998.
1101
1102L<http://www.iana.org/assignments/uri-schemes>
1103
1104L<http://www.iana.org/assignments/urn-namespaces>
1105
1106L<http://www.w3.org/Addressing/>
1107
1108=head1 COPYRIGHT
1109
1110Copyright 1995-2009 Gisle Aas.
1111
1112Copyright 1995 Martijn Koster.
1113
1114This program is free software; you can redistribute it and/or modify
1115it under the same terms as Perl itself.
1116
1117=head1 AUTHORS / ACKNOWLEDGMENTS
1118
1119This module is based on the C<URI::URL> module, which in turn was
1120(distantly) based on the C<wwwurl.pl> code in the libwww-perl for
1121perl4 developed by Roy Fielding, as part of the Arcadia project at the
1122University of California, Irvine, with contributions from Brooks
1123Cutter.
1124
1125C<URI::URL> was developed by Gisle Aas, Tim Bunce, Roy Fielding and
1126Martijn Koster with input from other people on the libwww-perl mailing
1127list.
1128
1129C<URI> and related subclasses were developed by Gisle Aas.
1130
1131=cut
Note: See TracBrowser for help on using the repository browser.