source: main/trunk/greenstone2/perllib/cpan/WWW/RobotRules.pm@ 27181

Last change on this file since 27181 was 27181, checked in by davidb, 11 years ago

The latest libwww-perl (v6.x) is not as self-sufficient as the earlier releases (v5.x) in terms of supporting Perl modules. Dropping back to this earlier version so that activate.pl runs smoothly when the system-installed Perl on a Unix system does not have LWP and the related modules installed.

File size: 10.7 KB
package WWW::RobotRules;

$VERSION = "5.832";
sub Version { $VERSION; }

use strict;
use URI ();



sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}


sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;            # 1 iff this record is for me
    my $is_anon = 0;          # 1 iff this record is for *
    my $seen_disallow = 0;    # watch for missing record separators
    my @me_disallowed = ();   # rules disallowed for me
    my @anon_disallowed = (); # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;           # remove comments at end-of-line

        if (/^\s*$/) {         # blank line
            last if $is_me;    # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1;  # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
            if (length $disallow) {
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
            # ignore
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}


#
# Returns TRUE if the given name matches the
# name of this robot
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # See whether the name given on the "User-Agent: ..." line
    # is a substring of my short-name:

    if (index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}


sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}


# The following methods must be provided by the subclass.
sub agent;
sub visit;
sub no_visits;
sub last_visits;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;



package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);



sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        # Strip it so that it's just the short name.
        # I.e., "FooBot"                                          => "FooBot"
        #       "FooBot/1.2"                                      => "FooBot"
        #       "FooBot/1.2 [http://foobot.int; [email protected]]" => "FooBot"

        $name = $1 if $name =~ m/(\S+)/; # get first word
        $name =~ s!/.*!!;                # get rid of version
        unless ($old && $old eq $name) {
            delete $self->{'loc'};       # all old info is now stale
            $self->{'ua'} = $name;
        }
    }
    $old;
}


sub visit {
    my($self, $netloc, $time) = @_;
    return unless $netloc;
    $time ||= time;
    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}


sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}


sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}


sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}


sub push_rules {
    my($self, $netloc, @rules) = @_;
    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}


sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}


sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}


sub dump
{
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}


1;

__END__


# Bender: "Well, I don't have anything else
#          planned for today. Let's get drunk!"

=head1 NAME

WWW::RobotRules - database of robots.txt-derived permissions

=head1 SYNOPSIS

 use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('MOMspider/1.0');

 use LWP::Simple qw(get);

 {
   my $url = "http://some.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 {
   my $url = "http://some.other.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 # Now we can check if a URL is valid for those servers
 # whose "robots.txt" files we've gotten and parsed:
 if($rules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", at
<http://www.robotstxt.org/wc/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots from accessing parts of their web site.

The parsed files are kept in a WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited. The
same WWW::RobotRules object can be used for one or more parsed
F</robots.txt> files on any number of hosts.

The following methods are provided:

=over 4

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects. The first
argument given to new() is the name of the robot.

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file. The
optional $fresh_until argument is the time (seconds since the epoch)
until which the parsed rules are considered fresh; if it is omitted,
the rules are kept for one year. A combined usage sketch follows this
method list.

=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL.

=item $rules->agent([$name])

Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
rules and expire times out of the cache.

=back
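
The sketch below is only an illustration of how these methods fit
together; the robot name, the host name, and the C<LWP::Simple> fetch
are placeholders rather than anything prescribed by this module.

 use WWW::RobotRules;
 use LWP::Simple qw(get);

 my $rules = WWW::RobotRules->new('MyBot/0.1');  # short name becomes "MyBot"

 # Fetch and parse the robots.txt of a (hypothetical) host
 my $robots_url = "http://www.example.com/robots.txt";
 my $robots_txt = get($robots_url);
 $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

 # allowed() does prefix matching against the stored Disallow rules
 my $status = $rules->allowed("http://www.example.com/private/page.html");
 if ($status == 1) {
     # allowed; go ahead and fetch the page
 }
 elsif ($status == 0) {
     # disallowed by a matching Disallow rule
 }
 else {
     # allowed() returned -1: no fresh rules are cached for this host,
     # so its robots.txt should be (re)fetched and parsed first
 }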

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://www.robotstxt.org/wc/norobots.html>):

The file consists of one or more records separated by one or more
blank lines. Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive. Text after the '#' character on a
line is ignored during parsing. This is used for comments. The
following <field-names> can be used:

=over 3

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for. If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot. At least one field needs to be present per record. If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

The I<User-Agent> fields must occur before the I<Disallow> fields. If a
record contains a I<User-Agent> field after a I<Disallow> field, that
constitutes a malformed record. This parser will assume that a blank
line should have been placed before that I<User-Agent> field, and will
break the record into two. All the fields before the I<User-Agent> field
will constitute a record, and the I<User-Agent> field will be the first
field in a new record.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited. This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

Unrecognized records are ignored.

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

This example indicates that no robots should visit this site further:

  # go away
  User-agent: *
  Disallow: /

This is an example of a malformed robots.txt file.

  # robots.txt for ancientcastle.example.com
  # I've locked myself away.
  User-agent: *
  Disallow: /
  # The castle is your home now, so you can go anywhere you like.
  User-agent: Belle
  Disallow: /west-wing/ # except the west wing!
  # It's good to be the Prince...
  User-agent: Beast
  Disallow:

This file is missing the required blank lines between records.
However, the intention is clear.
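
As a rough illustration (reusing the made-up host and robot names from
the example above, and assuming $malformed_robots_txt holds that text),
the parser's recovery means a robot named "Belle" still gets just its
own record:

 use WWW::RobotRules;

 my $rules = WWW::RobotRules->new('Belle/1.0');
 $rules->parse("http://ancientcastle.example.com/robots.txt",
               $malformed_robots_txt);  # the malformed text shown above

 $rules->allowed("http://ancientcastle.example.com/west-wing/tower");
 # => 0 (Belle's "Disallow: /west-wing/" rule matches)

 $rules->allowed("http://ancientcastle.example.com/library");
 # => 1 (no rule from Belle's record matches)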

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>