[27174] | 1 | package URI::Heuristic;
|
---|
| 2 |
|
---|
| 3 | =head1 NAME
|
---|
| 4 |
|
---|
| 5 | URI::Heuristic - Expand URI using heuristics
|
---|
| 6 |
|
---|
| 7 | =head1 SYNOPSIS
|
---|
| 8 |
|
---|
| 9 | use URI::Heuristic qw(uf_uristr);
|
---|
| 10 | $u = uf_uristr("perl"); # http://www.perl.com
|
---|
| 11 | $u = uf_uristr("www.sol.no/sol"); # http://www.sol.no/sol
|
---|
| 12 | $u = uf_uristr("aas"); # http://www.aas.no
|
---|
| 13 | $u = uf_uristr("ftp.funet.fi"); # ftp://ftp.funet.fi
|
---|
| 14 | $u = uf_uristr("/etc/passwd"); # file:/etc/passwd
|
---|
| 15 |
|
---|
| 16 | =head1 DESCRIPTION
|
---|
| 17 |
|
---|
| 18 | This module provides functions that expand strings into real absolute
|
---|
| 19 | URIs using some built-in heuristics. Strings that already represent
|
---|
| 20 | absolute URIs (i.e. that start with a C<scheme:> part) are never modified
|
---|
| 21 | and are returned unchanged. The main use of these functions is to
|
---|
| 22 | allow abbreviated URIs similar to what many web browsers allow for URIs
|
---|
| 23 | typed in by the user.
|
---|
| 24 |
|
---|
| 25 | The following functions are provided:
|
---|
| 26 |
|
---|
| 27 | =over 4
|
---|
| 28 |
|
---|
| 29 | =item uf_uristr($str)
|
---|
| 30 |
|
---|
| 31 | Tries to make the argument string
|
---|
| 32 | into a proper absolute URI string. The "uf_" prefix stands for "User
|
---|
| 33 | Friendly". Under MacOS, it assumes that any string with a common URL
|
---|
| 34 | scheme (http, ftp, etc.) is a URL rather than a local path. So don't name
|
---|
| 35 | your volumes after common URL schemes and expect uf_uristr() to construct
|
---|
| 36 | valid file: URL's on those volumes for you, because it won't.
|
---|
| 37 |
|
---|
| 38 | =item uf_uri($str)
|
---|
| 39 |
|
---|
| 40 | Works the same way as uf_uristr() but
|
---|
| 41 | returns a C<URI> object.
|
---|
| 42 |
|
---|
| 43 | =back
|
---|
| 44 |
|
---|
| 45 | =head1 ENVIRONMENT
|
---|
| 46 |
|
---|
| 47 | If the hostname portion of a URI does not contain any dots, then
|
---|
| 48 | certain qualified guesses are made. These guesses are governed by
|
---|
| 49 | the following environment variables:
|
---|
| 50 |
|
---|
| 51 | =over 10
|
---|
| 52 |
|
---|
| 53 | =item COUNTRY
|
---|
| 54 |
|
---|
| 55 | The two-letter country code (ISO 3166) for your location. If
|
---|
| 56 | the domain name of your host ends with two letters, then it is taken
|
---|
| 57 | to be the default country. See also L<Locale::Country>.
|
---|
| 58 |
|
---|
| 59 | =item HTTP_ACCEPT_LANGUAGE, LC_ALL, LANG
|
---|
| 60 |
|
---|
| 61 | If COUNTRY is not set, these standard environment variables are
|
---|
| 62 | examined and country (not language) information possibly found in them
|
---|
| 63 | is used as the default country.
|
---|
| 64 |
|
---|
| 65 | =item URL_GUESS_PATTERN
|
---|
| 66 |
|
---|
| 67 | Contains a space-separated list of URL patterns to try. The string
|
---|
| 68 | "ACME" is for some reason used as a placeholder for the host name in
|
---|
| 69 | the URL provided. Example:
|
---|
| 70 |
|
---|
| 71 | URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
|
---|
| 72 | export URL_GUESS_PATTERN
|
---|
| 73 |
|
---|
| 74 | Specifying URL_GUESS_PATTERN disables any guessing rules based on
|
---|
| 75 | country. An empty URL_GUESS_PATTERN disables any guessing that
|
---|
| 76 | involves host name lookups.
|
---|
| 77 |
|
---|
| 78 | =back
|
---|
| 79 |
|
---|
| 80 | =head1 COPYRIGHT
|
---|
| 81 |
|
---|
| 82 | Copyright 1997-1998, Gisle Aas
|
---|
| 83 |
|
---|
| 84 | This library is free software; you can redistribute it and/or
|
---|
| 85 | modify it under the same terms as Perl itself.
|
---|
| 86 |
|
---|
| 87 | =cut
|
---|
| 88 |
|
---|
| 89 | use strict;
|
---|
| 90 |
|
---|
| 91 | use vars qw(@EXPORT_OK $VERSION $MY_COUNTRY %LOCAL_GUESSING $DEBUG);
|
---|
| 92 |
|
---|
| 93 | require Exporter;
|
---|
| 94 | *import = \&Exporter::import;
|
---|
| 95 | @EXPORT_OK = qw(uf_uri uf_uristr uf_url uf_urlstr);
|
---|
| 96 | $VERSION = "4.20";
|
---|
| 97 |
|
---|
| 98 | sub MY_COUNTRY() {
|
---|
| 99 | for ($MY_COUNTRY) {
|
---|
| 100 | return $_ if defined;
|
---|
| 101 |
|
---|
| 102 | # First try the environment.
|
---|
| 103 | $_ = $ENV{COUNTRY};
|
---|
| 104 | return $_ if defined;
|
---|
| 105 |
|
---|
| 106 | # Try the country part of LC_ALL and LANG from environment
|
---|
| 107 | my @srcs = ($ENV{LC_ALL}, $ENV{LANG});
|
---|
| 108 | # ...and HTTP_ACCEPT_LANGUAGE before those if present
|
---|
| 109 | if (my $httplang = $ENV{HTTP_ACCEPT_LANGUAGE}) {
|
---|
| 110 | # TODO: q-value processing/ordering
|
---|
| 111 | for $httplang (split(/\s*,\s*/, $httplang)) {
|
---|
| 112 | if ($httplang =~ /^\s*([a-zA-Z]+)[_-]([a-zA-Z]{2})\s*$/) {
|
---|
| 113 | unshift(@srcs, "${1}_${2}");
|
---|
| 114 | last;
|
---|
| 115 | }
|
---|
| 116 | }
|
---|
| 117 | }
|
---|
| 118 | for (@srcs) {
|
---|
| 119 | next unless defined;
|
---|
| 120 | return lc($1) if /^[a-zA-Z]+_([a-zA-Z]{2})(?:[.@]|$)/;
|
---|
| 121 | }
|
---|
| 122 |
|
---|
| 123 | # Last bit of domain name. This may access the network.
|
---|
| 124 | require Net::Domain;
|
---|
| 125 | my $fqdn = Net::Domain::hostfqdn();
|
---|
| 126 | $_ = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
|
---|
| 127 | return $_ if defined;
|
---|
| 128 |
|
---|
| 129 | # Give up. Defined but false.
|
---|
| 130 | return ($_ = 0);
|
---|
| 131 | }
|
---|
| 132 | }
|
---|
| 133 |
|
---|
| 134 | %LOCAL_GUESSING =
|
---|
| 135 | (
|
---|
| 136 | 'us' => [qw(www.ACME.gov www.ACME.mil)],
|
---|
| 137 | 'gb' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
|
---|
| 138 | 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
|
---|
| 139 | 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
|
---|
| 140 | # send corrections and new entries to <[email protected]>
|
---|
| 141 | );
|
---|
| 142 | # Backwards compatibility; uk != United Kingdom in ISO 3166
|
---|
| 143 | $LOCAL_GUESSING{uk} = $LOCAL_GUESSING{gb};
|
---|
| 144 |
|
---|
| 145 |
|
---|
| 146 | sub uf_uristr ($)
|
---|
| 147 | {
|
---|
| 148 | local($_) = @_;
|
---|
| 149 | print STDERR "uf_uristr: resolving $_\n" if $DEBUG;
|
---|
| 150 | return unless defined;
|
---|
| 151 |
|
---|
| 152 | s/^\s+//;
|
---|
| 153 | s/\s+$//;
|
---|
| 154 |
|
---|
| 155 | if (/^(www|web|home)[a-z0-9-]*(?:\.|$)/i) {
|
---|
| 156 | $_ = "http://$_";
|
---|
| 157 |
|
---|
| 158 | } elsif (/^(ftp|gopher|news|wais|https|http)[a-z0-9-]*(?:\.|$)/i) {
|
---|
| 159 | $_ = lc($1) . "://$_";
|
---|
| 160 |
|
---|
| 161 | } elsif ($^O ne "MacOS" &&
|
---|
| 162 | (m,^/, || # absolute file name
|
---|
| 163 | m,^\.\.?/, || # relative file name
|
---|
| 164 | m,^[a-zA-Z]:[/\\],) # dosish file name
|
---|
| 165 | )
|
---|
| 166 | {
|
---|
| 167 | $_ = "file:$_";
|
---|
| 168 |
|
---|
| 169 | } elsif ($^O eq "MacOS" && m/:/) {
|
---|
| 170 | # potential MacOS file name
|
---|
| 171 | unless (m/^(ftp|gopher|news|wais|http|https|mailto):/) {
|
---|
| 172 | require URI::file;
|
---|
| 173 | my $a = URI::file->new($_)->as_string;
|
---|
| 174 | $_ = ($a =~ m/^file:/) ? $a : "file:$a";
|
---|
| 175 | }
|
---|
| 176 | } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
|
---|
| 177 | $_ = "mailto:$_";
|
---|
| 178 |
|
---|
| 179 | } elsif (!/^[a-zA-Z][a-zA-Z0-9.+\-]*:/) { # no scheme specified
|
---|
| 180 | if (s/^([-\w]+(?:\.[-\w]+)*)([\/:\?\#]|$)/$2/) {
|
---|
| 181 | my $host = $1;
|
---|
| 182 |
|
---|
| 183 | my $scheme = "http";
|
---|
| 184 | if (/^:(\d+)\b/) {
|
---|
| 185 | # Some more or less well known ports
|
---|
| 186 | if ($1 =~ /^[56789]?443$/) {
|
---|
| 187 | $scheme = "https";
|
---|
| 188 | } elsif ($1 eq "21") {
|
---|
| 189 | $scheme = "ftp";
|
---|
| 190 | }
|
---|
| 191 | }
|
---|
| 192 |
|
---|
| 193 | if ($host !~ /\./ && $host ne "localhost") {
|
---|
| 194 | my @guess;
|
---|
| 195 | if (exists $ENV{URL_GUESS_PATTERN}) {
|
---|
| 196 | @guess = map { s/\bACME\b/$host/; $_ }
|
---|
| 197 | split(' ', $ENV{URL_GUESS_PATTERN});
|
---|
| 198 | } else {
|
---|
| 199 | if (MY_COUNTRY()) {
|
---|
| 200 | my $special = $LOCAL_GUESSING{MY_COUNTRY()};
|
---|
| 201 | if ($special) {
|
---|
| 202 | my @special = @$special;
|
---|
| 203 | push(@guess, map { s/\bACME\b/$host/; $_ }
|
---|
| 204 | @special);
|
---|
| 205 | } else {
|
---|
| 206 | push(@guess, "www.$host." . MY_COUNTRY());
|
---|
| 207 | }
|
---|
| 208 | }
|
---|
| 209 | push(@guess, map "www.$host.$_",
|
---|
| 210 | "com", "org", "net", "edu", "int");
|
---|
| 211 | }
|
---|
| 212 |
|
---|
| 213 |
|
---|
| 214 | my $guess;
|
---|
| 215 | for $guess (@guess) {
|
---|
| 216 | print STDERR "uf_uristr: gethostbyname('$guess.')..."
|
---|
| 217 | if $DEBUG;
|
---|
| 218 | if (gethostbyname("$guess.")) {
|
---|
| 219 | print STDERR "yes\n" if $DEBUG;
|
---|
| 220 | $host = $guess;
|
---|
| 221 | last;
|
---|
| 222 | }
|
---|
| 223 | print STDERR "no\n" if $DEBUG;
|
---|
| 224 | }
|
---|
| 225 | }
|
---|
| 226 | $_ = "$scheme://$host$_";
|
---|
| 227 |
|
---|
| 228 | } else {
|
---|
| 229 | # pure junk, just return it unchanged...
|
---|
| 230 |
|
---|
| 231 | }
|
---|
| 232 | }
|
---|
| 233 | print STDERR "uf_uristr: ==> $_\n" if $DEBUG;
|
---|
| 234 |
|
---|
| 235 | $_;
|
---|
| 236 | }
|
---|
| 237 |
|
---|
| 238 | sub uf_uri ($)
|
---|
| 239 | {
|
---|
| 240 | require URI;
|
---|
| 241 | URI->new(uf_uristr($_[0]));
|
---|
| 242 | }
|
---|
| 243 |
|
---|
| 244 | # legacy
|
---|
| 245 | *uf_urlstr = \*uf_uristr;
|
---|
| 246 |
|
---|
| 247 | sub uf_url ($)
|
---|
| 248 | {
|
---|
| 249 | require URI::URL;
|
---|
| 250 | URI::URL->new(uf_uristr($_[0]));
|
---|
| 251 | }
|
---|
| 252 |
|
---|
| 253 | 1;
|
---|