package WWW::RobotRules;

$VERSION = "5.832";
sub Version { $VERSION; }

use strict;
use URI ();


sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}


sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;            # 1 iff this record is for me
    my $is_anon = 0;          # 1 iff this record is for *
    my $seen_disallow = 0;    # watch for missing record separators
    my @me_disallowed = ();   # rules disallowed for me
    my @anon_disallowed = (); # rules disallowed for *

    # Blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones.
    $txt =~ s/\015\012/\012/g;

    # Split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL).
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;        # remove comments at end-of-line

        if (/^\s*$/) {      # blank line
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1; # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
            if (length $disallow) {
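                # A Disallow value may be given as a full URL. Resolve it
                # against the robots.txt URL and keep only the path+query;
                # rules that point at a different scheme, host, or port
                # are skipped.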
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
            # ignore
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}


#
# Returns TRUE if the given name matches the
# name of this robot
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # See whether the record's "User-Agent: ..." value is a
    # (case-insensitive) substring of my short name:

    if (index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}


sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

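    # Missing or expired rules for this server mean we cannot give a
    # definitive answer; -1 tells the caller to fetch and parse a fresh
    # robots.txt first.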
    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}


# The following methods must be provided by the subclass.
sub agent;
sub visit;
sub no_visits;
sub last_visit;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;


package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);


sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        # Strip it so that it's just the short name.
        # I.e., "FooBot"                                      => "FooBot"
        #       "FooBot/1.2"                                  => "FooBot"
        #       "FooBot/1.2 [http://foobot.int; [email protected]]" => "FooBot"

        $name = $1 if $name =~ m/(\S+)/; # get first word
        $name =~ s!/.*!!;                # get rid of version
        unless ($old && $old eq $name) {
            delete $self->{'loc'}; # all old info is now stale
            $self->{'ua'} = $name;
        }
    }
    $old;
}

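# Per-server visit bookkeeping: visit() records each access to a host,
# so callers (LWP::RobotUA, for instance) can see how often and how
# recently a host was hit.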
sub visit {
    my($self, $netloc, $time) = @_;
    return unless $netloc;
    $time ||= time;
    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}


sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}


sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}


sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}


sub push_rules {
    my($self, $netloc, @rules) = @_;
    push(@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}


sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}


sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}


sub dump
{
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}


1;

__END__


# Bender: "Well, I don't have anything else
#         planned for today. Let's get drunk!"

=head1 NAME

WWW::RobotRules - database of robots.txt-derived permissions

=head1 SYNOPSIS

 use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('MOMspider/1.0');

 use LWP::Simple qw(get);

 {
   my $url = "http://some.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 {
   my $url = "http://some.other.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 # Now we can check if a URL is valid for those servers
 # whose "robots.txt" files we've gotten and parsed:
 if($rules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", at
<http://www.robotstxt.org/wc/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots from accessing parts of their web site.

The parsed files are kept in a WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited. The
same WWW::RobotRules object can be used for one or more parsed
F</robots.txt> files on any number of hosts.

The following methods are provided:

=over 4

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects. The first
argument given to new() is the name of the robot.

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file. The
optional $fresh_until argument is the time (in seconds since the
epoch) until which the parsed rules are considered fresh; if it is
omitted, the rules stay fresh for one year.

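For example, to cache the rules for one day only (the URL and rules
here are illustrative):

 $rules->parse("http://some.place/robots.txt",
               "User-agent: *\nDisallow: /private/\n",
               time + 24*60*60);
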
=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL. Returns -1
if the rules for the URL's server are missing from the cache or are no
longer fresh, meaning that a new F</robots.txt> file should be fetched
and parsed before asking again.

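A minimal check might look like this (the URL is illustrative):

 if ($rules->allowed("http://some.place/private/index.html")) {
     # fetch the page
 }
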
=item $rules->agent([$name])

Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
rules and expire times out of the cache.

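The name is reduced to its short form (first word, version stripped)
before it is stored; for example (the name is illustrative):

 $rules->agent("FooBot/1.2 [http://foobot.int]");
 print $rules->agent; # prints "FooBot"
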
=back

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://www.robotstxt.org/wc/norobots.html>):

The file consists of one or more records separated by one or more
blank lines. Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive. Text after the '#' character on a
line is ignored during parsing. This is used for comments. The
following <field-names> can be used:

=over 3

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for. If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot. At least one field needs to be present per record. If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

The I<User-Agent> fields must occur before the I<Disallow> fields. If a
record contains a I<User-Agent> field after a I<Disallow> field, that
constitutes a malformed record. This parser will assume that a blank
line should have been placed before that I<User-Agent> field, and will
break the record into two. All the fields before the I<User-Agent> field
will constitute a record, and the I<User-Agent> field will be the first
field in a new record.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited. This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

Unrecognized records are ignored.

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

This example indicates that no robots should visit this site further:

  # go away
  User-agent: *
  Disallow: /

This is an example of a malformed robots.txt file:

  # robots.txt for ancientcastle.example.com
  # I've locked myself away.
  User-agent: *
  Disallow: /
  # The castle is your home now, so you can go anywhere you like.
  User-agent: Belle
  Disallow: /west-wing/ # except the west wing!
  # It's good to be the Prince...
  User-agent: Beast
  Disallow:

This file is missing the required blank lines between records.
However, the intention is clear.

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>