1 | package URI::Heuristic;
|
---|
2 |
|
---|
3 | # $Id: Heuristic.pm 720 1999-10-19 03:17:57Z davidb $
|
---|
4 |
|
---|
5 | =head1 NAME
|
---|
6 |
|
---|
7 | uf_urlstr - Expand URL using heuristics
|
---|
8 |
|
---|
9 | =head1 SYNOPSIS
|
---|
10 |
|
---|
11 | use URI::Heuristic qw(uf_urlstr);
|
---|
12 | $url = uf_urlstr("perl"); # http://www.perl.com
|
---|
13 | $url = uf_urlstr("www.sol.no/sol"); # http://www.sol.no/sol
|
---|
14 | $url = uf_urlstr("aas"); # http://www.aas.no
|
---|
15 | $url = uf_urlstr("ftp.funet.fi"); # ftp://ftp.funet.fi
|
---|
16 | $url = uf_urlstr("/etc/passwd"); # file:/etc/passwd
|
---|
17 |
|
---|
18 | =head1 DESCRIPTION
|
---|
19 |
|
---|
20 | This module provide functions that expand strings into real absolute
|
---|
21 | URLs using some builtin heuristics. Strings that already represent
|
---|
22 | absolute URLs (i.e. start with a C<scheme:> part) are never modified
|
---|
23 | and are returned unchanged. The main use of these functions are to
|
---|
24 | allow abbreviated URLs similar to what many web browsers allow for URLs
|
---|
25 | typed in by the user.
|
---|
26 |
|
---|
27 | The following functions are provided:
|
---|
28 |
|
---|
29 | =over 4
|
---|
30 |
|
---|
31 | =item uf_urlstr($str)
|
---|
32 |
|
---|
33 | The uf_urlstr() function will try to make the string passed as
|
---|
34 | argument into a proper absolute URL string. The "uf_" prefix stands
|
---|
35 | for "User Friendly".
|
---|
36 |
|
---|
37 | =item uf_url($str)
|
---|
38 |
|
---|
39 | This functions work the same way as uf_urlstr() but it will
|
---|
40 | return a C<URI::URL> object.
|
---|
41 |
|
---|
42 | =back
|
---|
43 |
|
---|
44 | =head1 ENVIRONMENT
|
---|
45 |
|
---|
46 | If the hostname portion of a URL does not contain any dots, then
|
---|
47 | certain qualified guesses will be made. These guesses are governed be
|
---|
48 | the following two environment variables.
|
---|
49 |
|
---|
50 | =over 10
|
---|
51 |
|
---|
52 | =item COUNTRY
|
---|
53 |
|
---|
54 | This is the two letter country code (ISO 3166) for your location. If
|
---|
55 | the domain name of your host ends with two letters, then it is taken
|
---|
56 | to be the default country. See also L<Locale::Country>.
|
---|
57 |
|
---|
58 | =item URL_GUESS_PATTERN
|
---|
59 |
|
---|
60 | Contain a space separated list of URL patterns to try. The string
|
---|
61 | "ACME" is for some reason used as a placeholder for the host name in
|
---|
62 | the URL provided. Example:
|
---|
63 |
|
---|
64 | URL_GUESS_PATTERN="www.ACME.no www.ACME.se www.ACME.com"
|
---|
65 | export URL_GUESS_PATTERN
|
---|
66 |
|
---|
67 | Specifying URL_GUESS_PATTERN disables any guessing rules based on
|
---|
68 | country. An empty URL_GUESS_PATTERN disables any guessing that
|
---|
69 | involves host name lookups.
|
---|
70 |
|
---|
71 | =back
|
---|
72 |
|
---|
73 | =head1 COPYRIGHT
|
---|
74 |
|
---|
75 | Copyright 1997-1998, Gisle Aas
|
---|
76 |
|
---|
77 | This library is free software; you can redistribute it and/or
|
---|
78 | modify it under the same terms as Perl itself.
|
---|
79 |
|
---|
80 | =cut
|
---|
81 |
|
---|
82 | use strict;
|
---|
83 |
|
---|
84 | use vars qw(@EXPORT_OK $VERSION $MY_COUNTRY %LOCAL_GUESSING $DEBUG);
|
---|
85 |
|
---|
86 | require Exporter;
|
---|
87 | *import = \&Exporter::import;
|
---|
88 | @EXPORT_OK = qw(uf_url uf_urlstr);
|
---|
89 | $VERSION = sprintf("%d.%02d", q$Revision: 720 $ =~ /(\d+)\.(\d+)/);
|
---|
90 |
|
---|
91 | eval {
|
---|
92 | require Net::Domain;
|
---|
93 | my $fqdn = Net::Domain::hostfqdn();
|
---|
94 | $MY_COUNTRY = lc($1) if $fqdn =~ /\.([a-zA-Z]{2})$/;
|
---|
95 |
|
---|
96 | # Some other heuristics to guess country? Perhaps looking
|
---|
97 | # at some environment variable (LANG, LC_ALL, ???)
|
---|
98 | $MY_COUNTRY = $ENV{COUNTRY} if exists $ENV{COUNTRY};
|
---|
99 | };
|
---|
100 |
|
---|
101 | %LOCAL_GUESSING =
|
---|
102 | (
|
---|
103 | 'us' => [qw(www.ACME.gov www.ACME.mil)],
|
---|
104 | 'uk' => [qw(www.ACME.co.uk www.ACME.org.uk www.ACME.ac.uk)],
|
---|
105 | 'au' => [qw(www.ACME.com.au www.ACME.org.au www.ACME.edu.au)],
|
---|
106 | 'il' => [qw(www.ACME.co.il www.ACME.org.il www.ACME.net.il)],
|
---|
107 | # send corrections and new entries to <[email protected]>
|
---|
108 | );
|
---|
109 |
|
---|
110 |
|
---|
111 | sub uf_url ($)
|
---|
112 | {
|
---|
113 | require URI::URL;
|
---|
114 | URI::URL->new(uf_urlstr($_[0]));
|
---|
115 | }
|
---|
116 |
|
---|
117 |
|
---|
118 | sub uf_urlstr ($)
|
---|
119 | {
|
---|
120 | local($_) = @_;
|
---|
121 | print STDERR "uf_urlstr: resolving $_\n" if $DEBUG;
|
---|
122 | return unless defined;
|
---|
123 |
|
---|
124 | s/^\s+//;
|
---|
125 | s/\s+$//;
|
---|
126 |
|
---|
127 | if (/^(www|web|home)\./) {
|
---|
128 | $_ = "http://$_";
|
---|
129 |
|
---|
130 | } elsif (/^(ftp|gopher|news|wais|http|https)\./) {
|
---|
131 | $_ = "$1://$_";
|
---|
132 |
|
---|
133 | } elsif (m,^/, || # absolute file name
|
---|
134 | m,^\.\.?/, || # relative file name
|
---|
135 | m,^[a-zA-Z]:[/\\],) # dosish file name
|
---|
136 | {
|
---|
137 | $_ = "file:$_";
|
---|
138 |
|
---|
139 | } elsif (/^\w+([\.\-]\w+)*\@(\w+\.)+\w{2,3}$/) {
|
---|
140 | $_ = "mailto:$_";
|
---|
141 |
|
---|
142 | } elsif (!/^[.+\-\w]+:/) { # no scheme specified
|
---|
143 | if (s/^(\w+(?:\.\w+)*)([\/:\?\#]|$)/$2/) {
|
---|
144 | my $host = $1;
|
---|
145 |
|
---|
146 | if ($host !~ /\./ && $host ne "localhost") {
|
---|
147 | my @guess;
|
---|
148 | if (exists $ENV{URL_GUESS_PATTERN}) {
|
---|
149 | @guess = map { s/\bACME\b/$host/; $_ }
|
---|
150 | split(' ', $ENV{URL_GUESS_PATTERN});
|
---|
151 | } else {
|
---|
152 | if ($MY_COUNTRY) {
|
---|
153 | my $special = $LOCAL_GUESSING{$MY_COUNTRY};
|
---|
154 | if ($special) {
|
---|
155 | my @special = @$special;
|
---|
156 | push(@guess, map { s/\bACME\b/$host/; $_ }
|
---|
157 | @special);
|
---|
158 | } else {
|
---|
159 | push(@guess, "www.$host.$MY_COUNTRY");
|
---|
160 | }
|
---|
161 | }
|
---|
162 | push(@guess, map "www.$host.$_",
|
---|
163 | "com", "org", "net", "edu", "int");
|
---|
164 | }
|
---|
165 |
|
---|
166 |
|
---|
167 | my $guess;
|
---|
168 | for $guess (@guess) {
|
---|
169 | print STDERR "uf_urlstr: gethostbyname('$guess')..."
|
---|
170 | if $DEBUG;
|
---|
171 | if (gethostbyname($guess)) {
|
---|
172 | print STDERR "yes\n" if $DEBUG;
|
---|
173 | $host = $guess;
|
---|
174 | last;
|
---|
175 | }
|
---|
176 | print STDERR "no\n" if $DEBUG;
|
---|
177 | }
|
---|
178 | }
|
---|
179 | $_ = "http://$host$_";
|
---|
180 |
|
---|
181 | } else {
|
---|
182 | # pure junk, just return it unchanged...
|
---|
183 |
|
---|
184 | }
|
---|
185 | }
|
---|
186 | print STDERR "uf_urlstr: ==> $_\n" if $DEBUG;
|
---|
187 |
|
---|
188 | $_;
|
---|
189 | }
|
---|
190 |
|
---|
191 | 1;
|
---|