Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

URI.pm@ 31957

Last change on this file since 31957 was 27191, checked in by davidb, 11 years ago
This should be in the 'cpan' area
File size: 33.0 KB

Line
1	package URI;
2
3	use strict;
4	use vars qw($VERSION);
5	$VERSION = "1.60";
6
7	use vars qw($ABS_REMOTE_LEADING_DOTS $ABS_ALLOW_RELATIVE_SCHEME $DEFAULT_QUERY_FORM_DELIMITER);
8
# Registry mapping a (lower-cased) scheme name to its implementor class.
my %implements;

# Some "official" character classes, kept as package globals.  Each one is
# built so that it can be dropped inside a regex character class: the
# punctuation is backslash-quoted, the alphanumeric ranges are not.

use vars qw($reserved $mark $unreserved $uric $scheme_re);

$reserved   = q(;/?:@&=+$,[]);
$mark       = q(-_.!~*'());    #'; emacs
$unreserved = "A-Za-z0-9" . quotemeta($mark);
$uric       = "\Q$reserved\E" . $unreserved . "%";

# A scheme name: one letter followed by letters, digits, ".", "+" or "-".
$scheme_re = '[a-zA-Z][a-zA-Z0-9.+\-]*';
20
21	use Carp ();
22	use URI::Escape ();
23
# Identity check: true when both arguments are the very same reference.
# overload::StrVal() yields the un-overloaded "Class=SCALAR(0x...)" form, so
# the '""' overload installed below does not get in the way.
sub _obj_eq {
    my ($left, $right) = @_;
    return overload::StrVal($left) eq overload::StrVal($right);
}

# Operator overloading:
#   ""        yields the URI text held in the blessed scalar,
#   == / !=   compare object identity (not URI equivalence),
#   everything else falls back to Perl's default conversions.
use overload (
    '""'     => sub { my ($self) = @_; return $$self },
    '=='     => sub { return  _obj_eq(@_) },
    '!='     => sub { return !_obj_eq(@_) },
    fallback => 1,
);
34
# Constructor: URI->new($str) or URI->new($str, $scheme).
#
# $str    - the URI text; undef is treated as "".  Anything else is
#           stringified first.
# $scheme - consulted only when $str carries no scheme of its own.  May be a
#           plain scheme name, a string starting with "scheme:", or an object
#           that answers ->scheme (its class is then reused as implementor).
#
# Returns whatever the chosen implementor class's _init() returns.
sub new
{
    my ($class, $uri, $scheme) = @_;

    # Force stringification; an undefined argument becomes the empty string.
    $uri = defined($uri) ? "$uri" : "";

    # Get rid of potential wrapping (<...>, <URL:...>, "...") and of leading
    # and trailing white space.
    for ($uri) {
        s/^<(?:URL:)?(.*)>$/$1/;
        s/^"(.*)"$/$1/;
        s/^\s+//;
        s/\s+$//;
    }

    my $impclass;
    if ($uri =~ m/^($scheme_re):/so) {
        # The string names its own scheme; that always wins.
        $scheme = $1;
    }
    elsif (($impclass = ref($scheme))) {
        # An object was passed as scheme: borrow both its class and scheme.
        $scheme = $scheme->scheme;
    }
    elsif ($scheme && $scheme =~ m/^($scheme_re)(?::|$)/o) {
        $scheme = $1;
    }

    unless ($impclass) {
        $impclass = implementor($scheme);
        unless ($impclass) {
            # No class is available for this scheme.
            require URI::_foreign;
            $impclass = 'URI::_foreign';
        }
    }

    return $impclass->_init($uri, $scheme);
}
66
67
# Constructor: $class->new_abs($str, $base).
# Builds a URI from $str (using $base as the scheme hint) and returns the
# result of calling ->abs($base) on it.
sub new_abs
{
    my ($class, $uri, $base) = @_;
    my $relative = $class->new($uri, $base);
    return $relative->abs($base);
}
74
75
# Low-level constructor, called as $impclass->_init($str, $scheme).
# Escapes $str, prefixes it with "$scheme:" when it has no scheme and the
# class does not allow scheme-less URIs, and returns the string blessed (as a
# scalar reference) into $class.
sub _init
{
    my ($class, $str, $scheme) = @_;

    # find all funny characters and encode the bytes.
    $str = $class->_uric_escape($str);

    if ($str !~ /^$scheme_re:/o && !$class->_no_scheme_ok) {
        $str = "$scheme:$str";
    }

    return bless \$str, $class;
}
87
88
# Percent-escapes every character of $str that is neither in the $uric set
# nor a "#", then downgrades the result from Perl's internal UTF-8
# representation.  Returns the escaped copy; the argument is not modified.
sub _uric_escape
{
    my ($class, $str) = @_;
    $str =~ s{([^$uric\#])}{ URI::Escape::escape_char($1) }ego;
    utf8::downgrade($str);
    return $str;
}
96
97
# URI::implementor($scheme)            - look up the implementor class
# URI::implementor($scheme, $impclass) - register $impclass for $scheme
#
# Plain function (not a method).
#
# Lookup form: returns the class name implementing $scheme.  A missing or
# syntactically invalid scheme yields "URI::_generic".  Otherwise the
# registry is consulted first; failing that, the class "URI::<scheme>" (with
# "+", "." and "-" mapped to "_P", "_O" and "_") is used if it is already
# loaded or can be require'd.  Returns nothing (undef in scalar context) when
# no such class exists.
#
# Register form: calls $impclass->_init_implementor($scheme), stores the
# class and returns the previously registered class (possibly undef).
#
# Dies only if loading an implementor module fails for a reason other than
# the module not being found in @INC.  The caller's $@ is left untouched
# when the module is merely absent.
sub implementor
{
    my($scheme, $impclass) = @_;
    if (!$scheme || $scheme !~ /\A$scheme_re\z/o) {
        require URI::_generic;
        return "URI::_generic";
    }

    $scheme = lc($scheme);

    if ($impclass) {
        # Set the implementor class for a given scheme
        my $old = $implements{$scheme};
        $impclass->_init_implementor($scheme);
        $implements{$scheme} = $impclass;
        return $old;
    }

    my $ic = $implements{$scheme};
    return $ic if $ic;

    # scheme not yet known, look for internal or
    # preloaded (with 'use') implementation
    $ic = "URI::$scheme";  # default location

    # turn scheme into a valid perl identifier by a simple transformation...
    $ic =~ s/\+/_P/g;
    $ic =~ s/\./_O/g;
    $ic =~ s/\-/_/g;

    no strict 'refs';
    # check we actually have one for the scheme:
    unless (@{"${ic}::ISA"}) {
        # Try to load it.  The string eval is safe: $scheme matched
        # $scheme_re above and its punctuation has just been rewritten, so
        # $ic is a plain package name.
        #
        # The probe must not leak its "Can't locate ..." message into the
        # caller's $@, so remember the old value and put it back once we
        # know the failure was only a missing module.
        my $saved_error = $@;
        eval "require $ic";
        die $@ if $@ && $@ !~ /Can\'t locate.*in \@INC/;
        $@ = $saved_error;
        return unless @{"${ic}::ISA"};
    }

    $ic->_init_implementor($scheme);
    $implements{$scheme} = $ic;
    $ic;
}
141
142
# Hook that implementor() calls as $class->_init_implementor($scheme) each
# time a class is registered for a scheme.  It does nothing here.
sub _init_implementor
{
    # Keep in mind that a single implementor class may end up serving
    # several URI schemes, so this can run more than once per class.
    my($class, $scheme) = @_;
}
149
150
# Returns an independent copy of the object: a new scalar holding the same
# URI text, blessed into the same class as the original.
sub clone
{
    my ($self) = @_;
    my $copy = $$self;
    return bless \$copy, ref($self);
}
157
158
# Whether a URI of this class may exist without a scheme.  _init() and
# _scheme() consult it; this class always answers no.
sub _no_scheme_ok { return 0 }
160
# $uri->_scheme            - return the scheme exactly as written, or
#                            nothing if the URI has none
# $uri->_scheme($new)      - replace the scheme; returns the old one
# $uri->_scheme(undef|"")  - drop the scheme if the class allows that;
#                            returns the old one
#
# Setting a scheme rebuilds the object through URI->new, so $uri may be
# re-blessed into a different class.  Croaks with "Bad scheme '...'" if $new
# is not a syntactically valid scheme name.
sub _scheme
{
    my $self = shift;

    unless (@_) {
        return unless $$self =~ /^($scheme_re):/o;
        return $1;
    }

    my $old;
    my $new = shift;
    if (defined($new) && length($new)) {
        # Anchor with \A...\z rather than ^...$: "$" would also match just
        # before a trailing newline and let "http\n" through as a scheme.
        Carp::croak("Bad scheme '$new'") unless $new =~ /\A$scheme_re\z/o;
        $old = $1 if $$self =~ s/^($scheme_re)://o;
        my $newself = URI->new("$new:$$self");
        $$self = $$newself;
        bless $self, ref($newself);
    }
    else {
        if ($self->_no_scheme_ok) {
            $old = $1 if $$self =~ s/^($scheme_re)://o;
            # With the scheme gone, whatever follows may itself now parse
            # as "scheme:"; warn about it when warnings are enabled.
            Carp::carp("Oops, opaque part now look like scheme")
                if $^W && $$self =~ m/^$scheme_re:/o
        }
        else {
            # The scheme cannot be removed for this class; just report it.
            $old = $1 if $$self =~ m/^($scheme_re):/o;
        }
    }

    return $old;
}
192
# Same calling conventions as _scheme(), but the scheme that comes back is
# lower-cased.  Returns nothing when _scheme() yields an undefined value.
sub scheme
{
    my $self = shift;
    my $raw  = $self->_scheme(@_);
    return lc($raw) if defined $raw;
    return;
}
199
200
# $uri->opaque        - return the part between "scheme:" and "#fragment"
# $uri->opaque($new)  - replace that part; returns the old value
#
# The new value is percent-escaped for every character outside $uric (which
# includes "#"), and undef is treated as "".  Scheme and fragment are kept.
# Dies if the URI text cannot be split, which the patterns should never allow.
sub opaque
{
    my ($self, @args) = @_;

    unless (@args) {
        $$self =~ /^(?:$scheme_re:)?([^\#]*)/o or die;
        return $1;
    }

    my ($old_scheme, $old_opaque, $old_frag) =
        $$self =~ /^($scheme_re:)?    # optional scheme
                    ([^\#]*)          # opaque
                    (\#.*)?           # optional fragment
                  $/sx
        or die;

    my $new_opaque = $args[0];
    $new_opaque = "" unless defined $new_opaque;
    $new_opaque =~ s/([^$uric])/ URI::Escape::escape_char($1)/ego;
    utf8::downgrade($new_opaque);

    # Reassemble: old scheme (if any), new opaque part, old fragment (if any).
    $$self = join "",
        (defined($old_scheme) ? $old_scheme : ""),
        $new_opaque,
        (defined($old_frag) ? $old_frag : ());

    return $old_opaque;
}

*path = \&opaque;  # alias
232
233
# $uri->fragment          - return the text after the first "#", or nothing
#                           if there is no "#"
# $uri->fragment($new)    - replace the fragment; returns the old one
# $uri->fragment(undef)   - remove the fragment; returns the old one
#
# A new fragment is percent-escaped for every character outside $uric.
sub fragment
{
    my ($self, @args) = @_;

    if (!@args) {
        return $1 if $$self =~ /\#(.*)/s;
        return;
    }

    # Cut off any existing fragment, remembering what it was.
    my $old = $$self =~ s/\#(.*)//s ? $1 : undef;

    my $new_frag = $args[0];
    if (defined $new_frag) {
        $new_frag =~ s/([^$uric])/ URI::Escape::escape_char($1) /ego;
        utf8::downgrade($new_frag);
        $$self .= "#$new_frag";
    }

    return $old;
}
253
254
# Returns the URI text stored in the object.
sub as_string
{
    my ($self) = @_;
    return $$self;
}
260
261
# Returns the URI as a Unicode string: percent-escaped octets >= 0x80 that
# form valid UTF-8 sequences are replaced by the characters they encode;
# octets that do not decode are put back in percent-escaped form.
sub as_iri
{
    my ($self) = @_;
    my $bytes = $$self;

    # Turn every %80..%FF escape into its raw octet.  If there was none,
    # the text is pure ASCII and can be returned as it stands.
    return $bytes
        unless $bytes =~ s/%([89a-fA-F][0-9a-fA-F])/chr(hex($1))/eg;

    # The manual loop below stands in for the more obvious
    #
    #   Encode::decode("UTF-8", $str, sub { sprintf "%%%02X", shift })
    #
    # which doesn't work before Encode 2.39.  Wait for a standard release
    # to bundle that version.

    require Encode;
    my $utf8    = Encode::find_encoding("UTF-8");
    my $decoded = "";
    while (length $bytes) {
        # FB_QUIET: decode as far as possible, leaving the rest in $bytes.
        $decoded .= $utf8->decode($bytes, Encode::FB_QUIET());
        next unless length $bytes;

        # escape next char
        $decoded .= URI::Escape::escape_char(substr($bytes, 0, 1, ""));
    }
    return $decoded;
}
288
289
# Returns a normalized form of the URI: scheme lower-cased, escapes of
# unreserved characters decoded, and all remaining escapes upper-cased.
# When the scheme has no capitals and the text holds no escapes at all, the
# object itself is returned; otherwise a modified clone is.
sub canonical
{
    my ($self) = @_;

    my $scheme      = $self->_scheme || "";
    my $has_uc      = $scheme =~ /[A-Z]/;
    my $has_escapes = $$self =~ /%[a-fA-F0-9]{2}/;
    return $self if !$has_uc && !$has_escapes;

    my $other = $self->clone;
    $other->_scheme(lc $scheme) if $has_uc;

    if ($has_escapes) {
        $$other =~ s{%([0-9a-fA-F]{2})}{
            my $hex  = $1;
            my $char = chr(hex($hex));
            # Unreserved characters need no escape; anything else keeps
            # its escape, with the hex digits upper-cased.
            $char =~ /^[$unreserved]\z/o ? $char : "%\U$hex";
        }ge;
    }
    return $other;
}
313
# Compare two URIs; subclasses will provide a more correct implementation.
# Either argument may be a plain string, in which case it is turned into an
# object with the other argument as scheme hint.  The two are equal when
# they are of the same class and their canonical strings match.
sub eq {
    my ($left, $right) = @_;
    $left  = URI->new($left, $right) if !ref $left;
    $right = URI->new($right, $left) if !ref $right;
    return ref($left) eq ref($right)    # same class
        && $left->canonical->as_string eq $right->canonical->as_string;
}
322
# generic-URI transformation methods; in this class both simply return the
# invocant, whatever base is passed.
sub abs { return $_[0] }
sub rel { return $_[0] }

# In this class a URI is never reported as using a secure channel.
sub secure { return 0 }
328
# help out Storable: the frozen form of an object is just its URI text.
sub STORABLE_freeze {
    my $self = shift;
    return ${$self};
}
334
# Counterpart of STORABLE_freeze: stores the frozen URI text back into the
# blessed scalar that is passed in as the first argument.
sub STORABLE_thaw {
    my ($self, undef, $frozen) = @_;
    $$self = $frozen;
}
339
340	1;
341
342	__END__
343
344	=head1 NAME
345
346	URI - Uniform Resource Identifiers (absolute and relative)
347
348	=head1 SYNOPSIS
349
350	$u1 = URI->new("http://www.perl.com");
351	$u2 = URI->new("foo", "http");
352	$u3 = $u2->abs($u1);
353	$u4 = $u3->clone;
354	$u5 = URI->new("HTTP://WWW.perl.com:80")->canonical;
355
356	$str = $u->as_string;
357	$str = "$u";
358
359	$scheme = $u->scheme;
360	$opaque = $u->opaque;
361	$path = $u->path;
362	$frag = $u->fragment;
363
364	$u->scheme("ftp");
365	$u->host("ftp.perl.com");
366	$u->path("cpan/");
367
368	=head1 DESCRIPTION
369
370	This module implements the C<URI> class. Objects of this class
371	represent "Uniform Resource Identifier references" as specified in RFC
372	2396 (and updated by RFC 2732).
373
374	A Uniform Resource Identifier is a compact string of characters that
375	identifies an abstract or physical resource. A Uniform Resource
376	Identifier can be further classified as either a Uniform Resource Locator
377	(URL) or a Uniform Resource Name (URN). The distinction between URL
378	and URN does not matter to the C<URI> class interface. A
379	"URI-reference" is a URI that may have additional information attached
380	in the form of a fragment identifier.
381
382	An absolute URI reference consists of three parts: a I<scheme>, a
383	I<scheme-specific part> and a I<fragment> identifier. A subset of URI
384	references share a common syntax for hierarchical namespaces. For
385	these, the scheme-specific part is further broken down into
386	I<authority>, I<path> and I<query> components. These URIs can also
387	take the form of relative URI references, where the scheme (and
388	usually also the authority) component is missing, but implied by the
389	context of the URI reference. The three forms of URI reference
390	syntax are summarized as follows:
391
392	<scheme>:<scheme-specific-part>#<fragment>
393	<scheme>://<authority><path>?<query>#<fragment>
394	<path>?<query>#<fragment>
395
396	The components into which a URI reference can be divided depend on the
397	I<scheme>. The C<URI> class provides methods to get and set the
398	individual components. The methods available for a specific
399	C<URI> object depend on the scheme.
400
401	=head1 CONSTRUCTORS
402
403	The following methods construct new C<URI> objects:
404
405	=over 4
406
407	=item $uri = URI->new( $str )
408
409	=item $uri = URI->new( $str, $scheme )
410
411	Constructs a new URI object. The string
412	representation of a URI is given as argument, together with an optional
413	scheme specification. Common URI wrappers like "" and <>, as well as
414	leading and trailing white space, are automatically removed from
415	the $str argument before it is processed further.
416
417	The constructor determines the scheme, maps this to an appropriate
418	URI subclass, constructs a new object of that class and returns it.
419
420	The $scheme argument is only used when $str is a
421	relative URI. It can be either a simple string that
422	denotes the scheme, a string containing an absolute URI reference, or
423	an absolute C<URI> object. If no $scheme is specified for a relative
424	URI $str, then $str is simply treated as a generic URI (no scheme-specific
425	methods available).
426
427	The set of characters available for building URI references is
428	restricted (see L<URI::Escape>). Characters outside this set are
429	automatically escaped by the URI constructor.
430
431	=item $uri = URI->new_abs( $str, $base_uri )
432
433	Constructs a new absolute URI object. The $str argument can
434	denote a relative or absolute URI. If relative, then it is
435	absolutized using $base_uri as base. The $base_uri must be an absolute
436	URI.
437
438	=item $uri = URI::file->new( $filename )
439
440	=item $uri = URI::file->new( $filename, $os )
441
442	Constructs a new I<file> URI from a file name. See L<URI::file>.
443
444	=item $uri = URI::file->new_abs( $filename )
445
446	=item $uri = URI::file->new_abs( $filename, $os )
447
448	Constructs a new absolute I<file> URI from a file name. See
449	L<URI::file>.
450
451	=item $uri = URI::file->cwd
452
453	Returns the current working directory as a I<file> URI. See
454	L<URI::file>.
455
456	=item $uri->clone
457
458	Returns a copy of the $uri.
459
460	=back
461
462	=head1 COMMON METHODS
463
464	The methods described in this section are available for all C<URI>
465	objects.
466
467	Methods that give access to components of a URI always return the
468	old value of the component. The value returned is C<undef> if the
469	component was not present. There is generally a difference between a
470	component that is empty (represented as C<"">) and a component that is
471	missing (represented as C<undef>). If an accessor method is given an
472	argument, it updates the corresponding component in addition to
473	returning the old value of the component. Passing an undefined
474	argument removes the component (if possible). The description of
475	each accessor method indicates whether the component is passed as
476	an escaped (percent-encoded) or an unescaped string. A component that can be further
477	divided into sub-parts is usually passed escaped, as unescaping might
478	change its semantics.
479
480	The common methods available for all URI are:
481
482	=over 4
483
484	=item $uri->scheme
485
486	=item $uri->scheme( $new_scheme )
487
488	Sets and returns the scheme part of the $uri. If the $uri is
489	relative, then $uri->scheme returns C<undef>. If called with an
490	argument, it updates the scheme of $uri, possibly changing the
491	class of $uri, and returns the old scheme value. The method croaks
492	if the new scheme name is illegal; a scheme name must begin with a
493	letter and must consist of only US-ASCII letters, numbers, and a few
494	special marks: ".", "+", "-". This restriction effectively means
495	that the scheme must be passed unescaped. Passing an undefined
496	argument to the scheme method makes the URI relative (if possible).
497
498	Letter case does not matter for scheme names. The string
499	returned by $uri->scheme is always lowercase. If you want the scheme
500	just as it was written in the URI in its original case,
501	you can use the $uri->_scheme method instead.
502
503	=item $uri->opaque
504
505	=item $uri->opaque( $new_opaque )
506
507	Sets and returns the scheme-specific part of the $uri
508	(everything between the scheme and the fragment)
509	as an escaped string.
510
511	=item $uri->path
512
513	=item $uri->path( $new_path )
514
515	Sets and returns the same value as $uri->opaque unless the URI
516	supports the generic syntax for hierarchical namespaces.
517	In that case the generic method is overridden to set and return
518	the part of the URI between the I<host name> and the I<fragment>.
519
520	=item $uri->fragment
521
522	=item $uri->fragment( $new_frag )
523
524	Sets and returns the fragment identifier of a URI reference
525	as an escaped string.
526
527	=item $uri->as_string
528
529	Returns the URI object as a plain ASCII string. URI objects are
530	also converted to plain strings automatically by overloading. This
531	means that $uri objects can be used as plain strings in most Perl
532	constructs.
533
534	=item $uri->as_iri
535
536	Returns a Unicode string representing the URI. Escaped UTF-8 sequences
537	representing non-ASCII characters are turned into their corresponding Unicode
538	code point.
539
540	=item $uri->canonical
541
542	Returns a normalized version of the URI. The rules
543	for normalization are scheme-dependent. They usually involve
544	lowercasing the scheme and Internet host name components,
545	removing the explicit port specification if it matches the default port,
546	uppercasing all escape sequences, and unescaping octets that can be
547	better represented as plain characters.
548
549	For efficiency reasons, if the $uri is already in normalized form,
550	then a reference to it is returned instead of a copy.
551
552	=item $uri->eq( $other_uri )
553
554	=item URI::eq( $first_uri, $other_uri )
555
556	Tests whether two URI references are equal. URI references
557	that normalize to the same string are considered equal. The method
558	can also be used as a plain function which can also test two string
559	arguments.
560
561	If you need to test whether two C<URI> object references denote the
562	same object, use the '==' operator.
563
564	=item $uri->abs( $base_uri )
565
566	Returns an absolute URI reference. If $uri is already
567	absolute, then a reference to it is simply returned. If the $uri
568	is relative, then a new absolute URI is constructed by combining the
569	$uri and the $base_uri, and returned.
570
571	=item $uri->rel( $base_uri )
572
573	Returns a relative URI reference if it is possible to
574	make one that denotes the same resource relative to $base_uri.
575	If not, then $uri is simply returned.
576
577	=item $uri->secure
578
579	Returns a TRUE value if the URI is considered to point to a resource on
580	a secure channel, such as an SSL or TLS encrypted one.
581
582	=back
583
584	=head1 GENERIC METHODS
585
586	The following methods are available to schemes that use the
587	common/generic syntax for hierarchical namespaces. The descriptions of
588	schemes below indicate which these are. Unknown schemes are
589	assumed to support the generic syntax, and therefore the following
590	methods:
591
592	=over 4
593
594	=item $uri->authority
595
596	=item $uri->authority( $new_authority )
597
598	Sets and returns the escaped authority component
599	of the $uri.
600
601	=item $uri->path
602
603	=item $uri->path( $new_path )
604
605	Sets and returns the escaped path component of
606	the $uri (the part between the host name and the query or fragment).
607	The path can never be undefined, but it can be the empty string.
608
609	=item $uri->path_query
610
611	=item $uri->path_query( $new_path_query )
612
613	Sets and returns the escaped path and query
614	components as a single entity. The path and the query are
615	separated by a "?" character, but the query can itself contain "?".
616
617	=item $uri->path_segments
618
619	=item $uri->path_segments( $segment, ... )
620
621	Sets and returns the path. In a scalar context, it returns
622	the same value as $uri->path. In a list context, it returns the
623	unescaped path segments that make up the path. Path segments that
624	have parameters are returned as an anonymous array. The first element
625	is the unescaped path segment proper; subsequent elements are escaped
626	parameter strings. Such an anonymous array uses overloading so it can
627	be treated as a string too, but this string does not include the
628	parameters.
629
630	Note that absolute paths have the empty string as their first
631	I<path_segment>, i.e. the I<path> C</foo/bar> has 3
632	I<path_segments>: "", "foo" and "bar".
633
634	=item $uri->query
635
636	=item $uri->query( $new_query )
637
638	Sets and returns the escaped query component of
639	the $uri.
640
641	=item $uri->query_form
642
643	=item $uri->query_form( $key1 => $val1, $key2 => $val2, ... )
644
645	=item $uri->query_form( $key1 => $val1, $key2 => $val2, ..., $delim )
646
647	=item $uri->query_form( \@key_value_pairs )
648
649	=item $uri->query_form( \@key_value_pairs, $delim )
650
651	=item $uri->query_form( \%hash )
652
653	=item $uri->query_form( \%hash, $delim )
654
655	Sets and returns query components that use the
656	I<application/x-www-form-urlencoded> format. Key/value pairs are
657	separated by "&", and the key is separated from the value by a "="
658	character.
659
660	The form can be set either by passing separate key/value pairs, or via
661	an array or hash reference. Passing an empty array or an empty hash
662	removes the query component, whereas passing no arguments at all leaves
663	the component unchanged. The order of keys is undefined if a hash
664	reference is passed. The old value is always returned as a list of
665	separate key/value pairs. Assigning this list to a hash is unwise as
666	the keys returned might repeat.
667
668	The values passed when setting the form can be plain strings or
669	references to arrays of strings. Passing an array of values has the
670	same effect as passing the key repeatedly with one value at a time.
671	All the following statements have the same effect:
672
673	$uri->query_form(foo => 1, foo => 2);
674	$uri->query_form(foo => [1, 2]);
675	$uri->query_form([ foo => 1, foo => 2 ]);
676	$uri->query_form([ foo => [1, 2] ]);
677	$uri->query_form({ foo => [1, 2] });
678
679	The $delim parameter can be passed as ";" to force the key/value pairs
680	to be delimited by ";" instead of "&" in the query string. This
681	practice is often recommended for URLs embedded in HTML or XML
682	documents as this avoids the trouble of escaping the "&" character.
683	You might also set the $URI::DEFAULT_QUERY_FORM_DELIMITER variable to
684	";" for the same global effect.
685
686	The C<URI::QueryParam> module can be loaded to add further methods to
687	manipulate the form of a URI. See L<URI::QueryParam> for details.
688
689	=item $uri->query_keywords
690
691	=item $uri->query_keywords( $keywords, ... )
692
693	=item $uri->query_keywords( \@keywords )
694
695	Sets and returns query components that use the
696	keywords separated by "+" format.
697
698	The keywords can be set either by passing separate keywords directly
699	or by passing a reference to an array of keywords. Passing an empty
700	array removes the query component, whereas passing no arguments at
701	all leaves the component unchanged. The old value is always returned
702	as a list of separate words.
703
704	=back
705
706	=head1 SERVER METHODS
707
708	For schemes where the I<authority> component denotes an Internet host,
709	the following methods are available in addition to the generic
710	methods.
711
712	=over 4
713
714	=item $uri->userinfo
715
716	=item $uri->userinfo( $new_userinfo )
717
718	Sets and returns the escaped userinfo part of the
719	authority component.
720
721	For some schemes this is a user name and a password separated by
722	a colon. This practice is not recommended. Embedding passwords in
723	clear text (such as URI) has proven to be a security risk in almost
724	every case where it has been used.
725
726	=item $uri->host
727
728	=item $uri->host( $new_host )
729
730	Sets and returns the unescaped hostname.
731
732	If the $new_host string ends with a colon and a number, then this
733	number also sets the port.
734
735	For IPv6 addresses the brackets around the raw address are removed in the return
736	value from $uri->host. When setting the host attribute to an IPv6 address you
737	can use a raw address or one enclosed in brackets. The address needs to be
738	enclosed in brackets if you want to pass in a new port value as well.
739
740	=item $uri->ihost
741
742	Returns the host in Unicode form. Any IDNA A-labels are turned into U-labels.
743
744	=item $uri->port
745
746	=item $uri->port( $new_port )
747
748	Sets and returns the port. The port is a simple integer
749	that should be greater than 0.
750
751	If a port is not specified explicitly in the URI, then the URI scheme's default port
752	is returned. If you don't want the default port
753	substituted, then you can use the $uri->_port method instead.
754
755	=item $uri->host_port
756
757	=item $uri->host_port( $new_host_port )
758
759	Sets and returns the host and port as a single
760	unit. The returned value includes a port, even if it matches the
761	default port. The host part and the port part are separated by a
762	colon: ":".
763
764	For IPv6 addresses the bracketing is preserved; thus
765	URI->new("http://[::1]/")->host_port returns "[::1]:80". Contrast this with
766	$uri->host which will remove the brackets.
767
768	=item $uri->default_port
769
770	Returns the default port of the URI scheme to which $uri
771	belongs. For I<http> this is the number 80, for I<ftp> this
772	is the number 21, etc. The default port for a scheme can not be
773	changed.
774
775	=back
776
777	=head1 SCHEME-SPECIFIC SUPPORT
778
779	Scheme-specific support is provided for the following URI schemes. For C<URI>
780	objects that do not belong to one of these, you can only use the common and
781	generic methods.
782
783	=over 4
784
785	=item B<data>:
786
787	The I<data> URI scheme is specified in RFC 2397. It allows inclusion
788	of small data items as "immediate" data, as if it had been included
789	externally.
790
791	C<URI> objects belonging to the data scheme support the common methods
792	and two new methods to access their scheme-specific components:
793	$uri->media_type and $uri->data. See L<URI::data> for details.
794
795	=item B<file>:
796
797	An old specification of the I<file> URI scheme is found in RFC 1738.
798	A new RFC 2396 based specification is not available yet, but file URI
799	references are in common use.
800
801	C<URI> objects belonging to the file scheme support the common and
802	generic methods. In addition, they provide two methods for mapping file URIs
803	back to local file names; $uri->file and $uri->dir. See L<URI::file>
804	for details.
805
806	=item B<ftp>:
807
808	An old specification of the I<ftp> URI scheme is found in RFC 1738. A
809	new RFC 2396 based specification is not available yet, but ftp URI
810	references are in common use.
811
812	C<URI> objects belonging to the ftp scheme support the common,
813	generic and server methods. In addition, they provide two methods for
814	accessing the userinfo sub-components: $uri->user and $uri->password.
815
816	=item B<gopher>:
817
818	The I<gopher> URI scheme is specified in
819	<draft-murali-url-gopher-1996-12-04> and will hopefully be available
820	as a RFC 2396 based specification.
821
822	C<URI> objects belonging to the gopher scheme support the common,
823	generic and server methods. In addition, they support some methods for
824	accessing gopher-specific path components: $uri->gopher_type,
825	$uri->selector, $uri->search, $uri->string.
826
827	=item B<http>:
828
829	The I<http> URI scheme is specified in RFC 2616.
830	The scheme is used to reference resources hosted by HTTP servers.
831
832	C<URI> objects belonging to the http scheme support the common,
833	generic and server methods.
834
835	=item B<https>:
836
837	The I<https> URI scheme is a Netscape invention which is commonly
838	implemented. The scheme is used to reference HTTP servers through SSL
839	connections. Its syntax is the same as http, but the default
840	port is different.
841
842	=item B<ldap>:
843
844	The I<ldap> URI scheme is specified in RFC 2255. LDAP is the
845	Lightweight Directory Access Protocol. An ldap URI describes an LDAP
846	search operation to perform to retrieve information from an LDAP
847	directory.
848
849	C<URI> objects belonging to the ldap scheme support the common,
850	generic and server methods as well as ldap-specific methods: $uri->dn,
851	$uri->attributes, $uri->scope, $uri->filter, $uri->extensions. See
852	L<URI::ldap> for details.
853
854	=item B<ldapi>:
855
856	Like the I<ldap> URI scheme, but uses a UNIX domain socket. The
857	server methods are not supported, and the local socket path is
858	available as $uri->un_path. The I<ldapi> scheme is used by the
859	OpenLDAP package. There is no real specification for it, but it is
860	mentioned in various OpenLDAP manual pages.
861
862	=item B<ldaps>:
863
864	Like the I<ldap> URI scheme, but uses an SSL connection. This
865	scheme is deprecated, as the preferred way is to use the I<start_tls>
866	mechanism.
867
868	=item B<mailto>:
869
870	The I<mailto> URI scheme is specified in RFC 2368. The scheme was
871	originally used to designate the Internet mailing address of an
872	individual or service. It has (in RFC 2368) been extended to allow
873	setting of other mail header fields and the message body.
874
875	C<URI> objects belonging to the mailto scheme support the common
876	methods and the generic query methods. In addition, they support the
877	following mailto-specific methods: $uri->to, $uri->headers.
878
879	Note that the "foo@example.com" part of a mailto is I<not> the
880	C<userinfo> and C<host> but instead the C<path>. This allows a
881	mailto URI to contain multiple comma separated email addresses.
882
883	=item B<mms>:
884
885	The I<mms> URL specification can be found at L<http://sdp.ppona.com/>.
886	C<URI> objects belonging to the mms scheme support the common,
887	generic, and server methods, with the exception of userinfo and
888	query-related sub-components.
889
890	=item B<news>:
891
892	The I<news>, I<nntp> and I<snews> URI schemes are specified in
893	<draft-gilman-news-url-01> and will hopefully be available as an RFC
894	2396 based specification soon.
895
896	C<URI> objects belonging to the news scheme support the common,
897	generic and server methods. In addition, they provide some methods to
898	access the path: $uri->group and $uri->message.
899
900	=item B<nntp>:
901
902	See I<news> scheme.
903
904	=item B<pop>:
905
906	The I<pop> URI scheme is specified in RFC 2384. The scheme is used to
907	reference a POP3 mailbox.
908
909	C<URI> objects belonging to the pop scheme support the common, generic
910	and server methods. In addition, they provide two methods to access the
911	userinfo components: $uri->user and $uri->auth.
912
913	=item B<rlogin>:
914
915	An old specification of the I<rlogin> URI scheme is found in RFC
916	1738. C<URI> objects belonging to the rlogin scheme support the
917	common, generic and server methods.
918
919	=item B<rtsp>:
920
921	The I<rtsp> URL specification can be found in section 3.2 of RFC 2326.
922	C<URI> objects belonging to the rtsp scheme support the common,
923	generic, and server methods, with the exception of userinfo and
924	query-related sub-components.
925
926	=item B<rtspu>:
927
928	The I<rtspu> URI scheme is used to talk to RTSP servers over UDP
929	instead of TCP. The syntax is the same as rtsp.
930
931	=item B<rsync>:
932
933	Information about rsync is available from L<http://rsync.samba.org/>.
934	C<URI> objects belonging to the rsync scheme support the common,
935	generic and server methods. In addition, they provide methods to
936	access the userinfo sub-components: $uri->user and $uri->password.
937
938	=item B<sip>:
939
940	The I<sip> URI specification is described in sections 19.1 and 25
941	of RFC 3261. C<URI> objects belonging to the sip scheme support the
942	common, generic, and server methods with the exception of path related
943	sub-components. In addition, they provide two methods to get and set
944	I<sip> parameters: $uri->params_form and $uri->params.
945
946	=item B<sips>:
947
948	See I<sip> scheme. Its syntax is the same as sip, but the default
949	port is different.
950
951	=item B<snews>:
952
953	See I<news> scheme. Its syntax is the same as news, but the default
954	port is different.
955
956	=item B<telnet>:
957
958	An old specification of the I<telnet> URI scheme is found in RFC
959	1738. C<URI> objects belonging to the telnet scheme support the
960	common, generic and server methods.
961
962	=item B<tn3270>:
963
964	These URIs are used like I<telnet> URIs but for connections to IBM
965	mainframes. C<URI> objects belonging to the tn3270 scheme support the
966	common, generic and server methods.
967
968	=item B<ssh>:
969
970	Information about ssh is available at L<http://www.openssh.com/>.
971	C<URI> objects belonging to the ssh scheme support the common,
972	generic and server methods. In addition, they provide methods to
973	access the userinfo sub-components: $uri->user and $uri->password.
974
975	=item B<urn>:
976
977	The syntax of Uniform Resource Names is specified in RFC 2141. C<URI>
978	objects belonging to the urn scheme provide the common methods, and also the
979	methods $uri->nid and $uri->nss, which return the Namespace Identifier
980	and the Namespace-Specific String respectively.
981
982	The Namespace Identifier basically works like the Scheme identifier of
983	URIs, and further divides the URN namespace. Namespace Identifier
984	assignments are maintained at
985	L<http://www.iana.org/assignments/urn-namespaces>.
986
987	Letter case is not significant for the Namespace Identifier. It is
988	always returned in lower case by the $uri->nid method. The $uri->_nid
989	method can be used if you want it in its original case.
990
991	=item B<urn>:B<isbn>:
992
993	The C<urn:isbn:> namespace contains International Standard Book
994	Numbers (ISBNs) and is described in RFC 3187. A C<URI> object belonging
995	to this namespace has the following extra methods (if the
996	Business::ISBN module is available): $uri->isbn,
997	$uri->isbn_publisher_code, $uri->isbn_group_code (formerly isbn_country_code,
998	which is still supported but issues a deprecation warning), $uri->isbn_as_ean.
999
1000	=item B<urn>:B<oid>:
1001
1002	The C<urn:oid:> namespace contains Object Identifiers (OIDs) and is
1003	described in RFC 3061. An object identifier consists of sequences of digits
1004	separated by dots. A C<URI> object belonging to this namespace has an
1005	additional method called $uri->oid that can be used to get/set the oid
1006	value. In a list context, oid numbers are returned as separate elements.
1007
1008	=back
1009
1010	=head1 CONFIGURATION VARIABLES
1011
1012	The following configuration variables influence how the class and its
1013	methods behave:
1014
1015	=over 4
1016
1017	=item $URI::ABS_ALLOW_RELATIVE_SCHEME
1018
1019	Some older parsers used to allow the scheme name to be present in the
1020	relative URL if it was the same as the base URL scheme. RFC 2396 says
1021	that this should be avoided, but you can enable this old behaviour by
1022	setting the $URI::ABS_ALLOW_RELATIVE_SCHEME variable to a TRUE value.
1023	The difference is demonstrated by the following examples:
1024
1025	URI->new("http:foo")->abs("http://host/a/b")
1026	==> "http:foo"
1027
1028	local $URI::ABS_ALLOW_RELATIVE_SCHEME = 1;
1029	URI->new("http:foo")->abs("http://host/a/b")
1030	==> "http://host/a/foo"
1031
1032
1033	=item $URI::ABS_REMOTE_LEADING_DOTS
1034
1035	You can also have the abs() method ignore excess ".."
1036	segments in the relative URI by setting $URI::ABS_REMOTE_LEADING_DOTS
1037	to a TRUE value. The difference is demonstrated by the following
1038	examples:
1039
1040	URI->new("../../../foo")->abs("http://host/a/b")
1041	==> "http://host/../../foo"
1042
1043	local $URI::ABS_REMOTE_LEADING_DOTS = 1;
1044	URI->new("../../../foo")->abs("http://host/a/b")
1045	==> "http://host/foo"
1046
1047	=item $URI::DEFAULT_QUERY_FORM_DELIMITER
1048
1049	This value can be set to ";" to have the query form C<key=value> pairs
1050	delimited by ";" instead of "&" which is the default.
1051
1052	=back
1053
1054	=head1 BUGS
1055
1056	There are some things that are not quite right:
1057
1058	=over
1059
1060	=item *
1061
1062	Using regexp variables like $1 directly as arguments to the URI accessor methods
1063	does not work too well with current perl implementations. I would argue
1064	that this is actually a bug in perl. The workaround is to quote
1065	them. Example:
1066
1067	/(...)/ || die;
1068	$u->query("$1");
1069
1070
1071	=item *
1072
1073	The escaping (percent encoding) of chars in the 128 .. 255 range passed to the
1074	URI constructor or when setting URI parts using the accessor methods depends on
1075	the state of the internal UTF8 flag (see utf8::is_utf8) of the string passed.
1076	If the UTF8 flag is set the UTF-8 encoded version of the character is percent
1077	encoded. If the UTF8 flag isn't set the Latin-1 version (byte) of the
1078	character is percent encoded. This basically exposes the internal encoding of
1079	Perl strings.
1080
1081	=back
1082
1083	=head1 PARSING URIs WITH REGEXP
1084
1085	As an alternative to this module, the following (official) regular
1086	expression can be used to decode a URI:
1087
1088	my($scheme, $authority, $path, $query, $fragment) =
1089	$uri =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
1090
1091	The C<URI::Split> module provides the function uri_split() as a
1092	readable alternative.
1093
1094	=head1 SEE ALSO
1095
1096	L<URI::file>, L<URI::WithBase>, L<URI::QueryParam>, L<URI::Escape>,
1097	L<URI::Split>, L<URI::Heuristic>
1098
1099	RFC 2396: "Uniform Resource Identifiers (URI): Generic Syntax",
1100	Berners-Lee, Fielding, Masinter, August 1998.
1101
1102	L<http://www.iana.org/assignments/uri-schemes>
1103
1104	L<http://www.iana.org/assignments/urn-namespaces>
1105
1106	L<http://www.w3.org/Addressing/>
1107
1108	=head1 COPYRIGHT
1109
1110	Copyright 1995-2009 Gisle Aas.
1111
1112	Copyright 1995 Martijn Koster.
1113
1114	This program is free software; you can redistribute it and/or modify
1115	it under the same terms as Perl itself.
1116
1117	=head1 AUTHORS / ACKNOWLEDGMENTS
1118
1119	This module is based on the C<URI::URL> module, which in turn was
1120	(distantly) based on the C<wwwurl.pl> code in the libwww-perl for
1121	perl4 developed by Roy Fielding, as part of the Arcadia project at the
1122	University of California, Irvine, with contributions from Brooks
1123	Cutter.
1124
1125	C<URI::URL> was developed by Gisle Aas, Tim Bunce, Roy Fielding and
1126	Martijn Koster with input from other people on the libwww-perl mailing
1127	list.
1128
1129	C<URI> and related subclasses were developed by Gisle Aas.
1130
1131	=cut

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/cpan/URI.pm@ 31957

Download in other formats: