1 | package URI::Heuristic;
|
---|
2 |
|
---|
3 | =head1 NAME
|
---|
4 |
|
---|
5 | URI::Heuristic - Expand URI using heuristics
|
---|
6 |
|
---|
7 | =head1 SYNOPSIS
|
---|
8 |
|
---|
9 | use URI::Heuristic qw(uf_uristr);
|
---|
10 | $u = uf_uristr("perl"); # http://www.perl.com
|
---|
11 | $u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol
|
---|
12 | $u = uf_uristr("aas"); # http://www.aas.no
|
---|
13 | $u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi
|
---|
14 | $u = uf_uristr("/etc/passwd"); # file:/etc/passwd
|
---|
15 |
|
---|
16 | =head1 DESCRIPTION
|
---|
17 |
|
---|
18 | This module provides functions that expand strings into real absolute
|
---|
19 | URIs using some built-in heuristics. Strings that already represent
|
---|
20 | absolute URIs (i.e. that start with a C<scheme:> part) are never modified
|
---|
21 | and are returned unchanged. The main use of these functions is to
|
---|
22 | allow abbreviated URIs similar to what many web browsers allow for URIs
|
---|
23 | typed in by the user.
|
---|
24 |
|
---|
25 | The following functions are provided:
|
---|
26 |
|
---|
27 | =over 4
|
---|
28 |
|
---|
29 | =item uf_uristr($str)
|
---|
30 |
|
---|
31 | Tries to make the argument string
|
---|
32 | into a proper absolute URI string. The "uf_" prefix stands for "User
|
---|
33 | Friendly". Under MacOS, it assumes that any string with a common URL
|
---|
34 | scheme (http, ftp, etc.) is a URL rather than a local path. So don't name
|
---|
35 | your volumes after common URL schemes and expect uf_uristr() to construct
|
---|
36 | valid file: URL's on those volumes for you, because it won't.
|
---|
37 |
|
---|
38 | =item uf_uri($str)
|
---|
39 |
|
---|
40 | Works the same way as uf_uristr() but
|
---|
41 | returns a C<URI> object.
|
---|
42 |
|
---|
43 | =back
|
---|
44 |
|
---|
45 | =head1 ENVIRONMENT
|
---|
46 |
|
---|
47 | If the hostname portion of a URI does not contain any dots, then
|
---|
48 | certain qualified guesses are made. These guesses are governed by
|
---|
49 | the following environment variables:
|
---|
50 |
|
---|
51 | =over 10
|
---|
52 |
|
---|
53 | =item COUNTRY
|
---|
54 |
|
---|
55 | The two-letter country code (ISO 3166) for your location. If
|
---|
56 | the domain name of your host ends with two letters, then it is taken
|
---|
57 | to be the default country. See also L<Locale::Country>.
|
---|
58 |
|
---|
59 | =item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG
|
---|
60 |
|
---|
61 | If COUNTRY is not set, these standard environment variables are
|
---|
62 | examined and country (not language) information possibly found in them
|
---|
63 | is used as the default country.
|
---|
64 |
|
---|
65 | =item URL_GUESS_PATTERN
|
---|
66 |
|
---|
67 | Contains a space-separated list of URL patterns to try. The string
|
---|
68 | "ACME" is for some reason used as a placeholder for the host name in
|
---|
69 | the URL provided. Example:
|
---|
70 |
|
---|
71 | URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
|
---|
72 | export URL_GUESS_PATTERN
|
---|
73 |
|
---|
74 | Specifying URL_GUESS_PATTERN disables any guessing rules based on
|
---|
75 | country. An empty URL_GUESS_PATTERN disables any guessing that
|
---|
76 | involves host name lookups.
|
---|
77 |
|
---|
78 | =back
|
---|
79 |
|
---|
80 | =head1 COPYRIGHT
|
---|
81 |
|
---|
82 | Copyright 1997-1998, Gisle Aas
|
---|
83 |
|
---|
84 | This library is free software; you can redistribute it and/or
|
---|
85 | modify it under the same terms as Perl itself.
|
---|
86 |
|
---|
87 | =cut
|
---|
88 |
|
---|
89 | use strict;
|
---|
90 |
|
---|
91 | use vars qw(@EXPORT_OK $VERSION $MY_COUNTRY %LOCAL_GUESSING $DEBUG);
|
---|
92 |
|
---|
93 | require Exporter;
|
---|
94 | *import = \&Exporter::import;
|
---|
95 | @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr);
|
---|
96 | $VERSION = "4.20";
|
---|
97 |
|
---|
98 | sub MY_COUNTRY() {
|
---|
99 | for ($MY_COUNTRY) {
|
---|
100 | return $_ if defined;
|
---|
101 |
|
---|
102 | # First try the environment.
|
---|
103 | $_ = $ENV{COUNTRY};
|
---|
104 | return $_ if defined;
|
---|
105 |
|
---|
106 | # Try the country part of LC_ALL and LANG from environment
|
---|
107 | my @srcs = ($ENV{LC_ALL}, $ENV{LANG});
|
---|
108 | # ...and HTTP_ACCEPT_LANGUAGE before those if present
|
---|
109 | if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) {
|
---|
110 | # TODO: q-value processing/ordering
|
---|
111 | for $httplang (split(/\s*,\s*/, $httplang)) {
|
---|
112 | if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) {
|
---|
113 | unshift(@srcs, "${1}_${2}");
|
---|
114 | last;
|
---|
115 | }
|
---|
116 | }
|
---|
117 | }
|
---|
118 | for (@srcs) {
|
---|
119 | next unless defined;
|
---|
120 | return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/;
|
---|
121 | }
|
---|
122 |
|
---|
123 | # Last bit of domain name. This may access the network.
|
---|
124 | require Net::Domain;
|
---|
125 | my $fqdn = Net::Domain::hostfqdn();
|
---|
126 | $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
|
---|
127 | return $_ if defined;
|
---|
128 |
|
---|
129 | # Give up. Defined but false.
|
---|
130 | return ($_ = 0);
|
---|
131 | }
|
---|
132 | }
|
---|
133 |
|
---|
134 | %LOCAL_GUESSING =
|
---|
135 | (
|
---|
136 | 'us' => [qw(www.ACME.gov www.ACME.mil)],
|
---|
137 | 'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
|
---|
138 | 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
|
---|
139 | 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
|
---|
140 | # send corrections and new entries to <[email protected]>
|
---|
141 | );
|
---|
142 | # Backwards compatibility; uk != United Kingdom in ISO 3166
|
---|
143 | $LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb};
|
---|
144 |
|
---|
145 |
|
---|
146 | sub uf_uristr ($)
|
---|
147 | {
|
---|
148 | local($_) = @_;
|
---|
149 | print STDERR "uf_uristr: resolving $_\n" if $DEBUG;
|
---|
150 | return unless defined;
|
---|
151 |
|
---|
152 | s/^\s+//;
|
---|
153 | s/\s+$//;
|
---|
154 |
|
---|
155 | if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) {
|
---|
156 | $_ = "http://$_";
|
---|
157 |
|
---|
158 | } elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) {
|
---|
159 | $_ = lc($1) . "://$_";
|
---|
160 |
|
---|
161 | } elsif ($^O ne "MacOS" &&
|
---|
162 | (m,^/, || # absolute file name
|
---|
163 | m,^\.\.?/, || # relative file name
|
---|
164 | m,^[a-zA-Z]:[/\\],) # dosish file name
|
---|
165 | )
|
---|
166 | {
|
---|
167 | $_ = "file:$_";
|
---|
168 |
|
---|
169 | } elsif ($^O eq "MacOS" && m/:/) {
|
---|
170 | # potential MacOS file name
|
---|
171 | unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) {
|
---|
172 | require URI::file;
|
---|
173 | my $a = URI::file->new($_)->as_string;
|
---|
174 | $_ = ($a =~ m/^file:/) ? $a : "file:$a";
|
---|
175 | }
|
---|
176 | } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
|
---|
177 | $_ = "mailto:$_";
|
---|
178 |
|
---|
179 | } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified
|
---|
180 | if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) {
|
---|
181 | my $host = $1;
|
---|
182 |
|
---|
183 | my $scheme = "http";
|
---|
184 | if (/^:(\d+)\b/) {
|
---|
185 | # Some more or less well known ports
|
---|
186 | if ($1 =~ /^[56789]?443$/) {
|
---|
187 | $scheme = "https";
|
---|
188 | } elsif ($1 eq "21") {
|
---|
189 | $scheme = "ftp";
|
---|
190 | }
|
---|
191 | }
|
---|
192 |
|
---|
193 | if ($host !~ /\./ && $host ne "localhost") {
|
---|
194 | my @guess;
|
---|
195 | if (exists $ENV{URL_GUESS_PATTERN}) {
|
---|
196 | @guess = map { s/\bACME\b/$host/; $_ }
|
---|
197 | split(' ', $ENV{URL_GUESS_PATTERN});
|
---|
198 | } else {
|
---|
199 | if (MY_COUNTRY()) {
|
---|
200 | my $special = $LOCAL_GUESSING{MY_COUNTRY()};
|
---|
201 | if ($special) {
|
---|
202 | my @special = @$special;
|
---|
203 | push(@guess, map { s/\bACME\b/$host/; $_ }
|
---|
204 | @special);
|
---|
205 | } else {
|
---|
206 | push(@guess, "www.$host." . MY_COUNTRY());
|
---|
207 | }
|
---|
208 | }
|
---|
209 | push(@guess, map "www.$host.$_",
|
---|
210 | "com", "org", "net", "edu", "int");
|
---|
211 | }
|
---|
212 |
|
---|
213 |
|
---|
214 | my $guess;
|
---|
215 | for $guess (@guess) {
|
---|
216 | print STDERR "uf_uristr: gethostbyname('$guess.')..."
|
---|
217 | if $DEBUG;
|
---|
218 | if (gethostbyname("$guess.")) {
|
---|
219 | print STDERR "yes\n" if $DEBUG;
|
---|
220 | $host = $guess;
|
---|
221 | last;
|
---|
222 | }
|
---|
223 | print STDERR "no\n" if $DEBUG;
|
---|
224 | }
|
---|
225 | }
|
---|
226 | $_ = "$scheme://$host$_";
|
---|
227 |
|
---|
228 | } else {
|
---|
229 | # pure junk, just return it unchanged...
|
---|
230 |
|
---|
231 | }
|
---|
232 | }
|
---|
233 | print STDERR "uf_uristr: ==> $_\n" if $DEBUG;
|
---|
234 |
|
---|
235 | $_;
|
---|
236 | }
|
---|
237 |
|
---|
238 | sub uf_uri ($)
|
---|
239 | {
|
---|
240 | require URI;
|
---|
241 | URI->new(uf_uristr($_[0]));
|
---|
242 | }
|
---|
243 |
|
---|
244 | # legacy
|
---|
245 | *uf_urlstr = \*uf_uristr;
|
---|
246 |
|
---|
247 | sub uf_url ($)
|
---|
248 | {
|
---|
249 | require URI::URL;
|
---|
250 | URI::URL->new(uf_uristr($_[0]));
|
---|
251 | }
|
---|
252 |
|
---|
253 | 1;
|
---|