source: main/trunk/greenstone2/perllib/cpan/WWW/RobotRules.pm@ 27181

Last change on this file since 27181 was 27181, checked in by davidb, 11 years ago

The latest libwww-perl (v6.x) is not as self-sufficient as the earlier releases (v5.x) in terms of supporting Perl modules. Dropping back to this earlier version so that activate.pl runs smoothly when the system-installed Perl on a Unix system does not have LWP and the related modules installed.

File size: 10.7 KB
package WWW::RobotRules;

$VERSION = "5.832";
sub Version { $VERSION; }

use strict;
use URI ();



sub new {
    my($class, $ua) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless { }, $class;
    $self->agent($ua);
    $self;
}


sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;            # 1 iff this record is for me
    my $is_anon = 0;          # 1 iff this record is for *
    my $seen_disallow = 0;    # watch for missing record separators
    my @me_disallowed = ();   # rules disallowed for me
    my @anon_disallowed = (); # rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for (split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;           # remove comments at end-of-line

        if (/^\s*$/) {         # blank line
            last if $is_me;    # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif ($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1;  # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
            if (length $disallow) {
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
            # ignore
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}


#
# Returns TRUE if the given name matches the
# name of this robot
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # See whether the name given on the "User-Agent: ..." line
    # is a substring of my short-name:

    if (index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}


sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}


# The following methods must be provided by the subclass.
sub agent;
sub visit;
sub no_visits;
sub last_visits;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;



package WWW::RobotRules::InCore;

use vars qw(@ISA);
@ISA = qw(WWW::RobotRules);



sub agent {
    my ($self, $name) = @_;
    my $old = $self->{'ua'};
    if ($name) {
        # Strip it so that it's just the short name.
        # I.e., "FooBot"                                          => "FooBot"
        #       "FooBot/1.2"                                      => "FooBot"
        #       "FooBot/1.2 [http://foobot.int; [email protected]]" => "FooBot"

        $name = $1 if $name =~ m/(\S+)/; # get first word
        $name =~ s!/.*!!;                # get rid of version
        unless ($old && $old eq $name) {
            delete $self->{'loc'};       # all old info is now stale
            $self->{'ua'} = $name;
        }
    }
    $old;
}


sub visit {
    my($self, $netloc, $time) = @_;
    return unless $netloc;
    $time ||= time;
    $self->{'loc'}{$netloc}{'last'} = $time;
    my $count = \$self->{'loc'}{$netloc}{'count'};
    if (!defined $$count) {
        $$count = 1;
    }
    else {
        $$count++;
    }
}


sub no_visits {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'count'};
}


sub last_visit {
    my ($self, $netloc) = @_;
    $self->{'loc'}{$netloc}{'last'};
}


sub fresh_until {
    my ($self, $netloc, $fresh_until) = @_;
    my $old = $self->{'loc'}{$netloc}{'fresh'};
    if (defined $fresh_until) {
        $self->{'loc'}{$netloc}{'fresh'} = $fresh_until;
    }
    $old;
}


sub push_rules {
    my($self, $netloc, @rules) = @_;
    push (@{$self->{'loc'}{$netloc}{'rules'}}, @rules);
}


sub clear_rules {
    my($self, $netloc) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}


sub rules {
    my($self, $netloc) = @_;
    if (defined $self->{'loc'}{$netloc}{'rules'}) {
        return @{$self->{'loc'}{$netloc}{'rules'}};
    }
    else {
        return ();
    }
}


sub dump
{
    my $self = shift;
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}


1;

__END__


# Bender: "Well, I don't have anything else
#          planned for today. Let's get drunk!"

=head1 NAME

WWW::RobotRules - database of robots.txt-derived permissions

=head1 SYNOPSIS

 use WWW::RobotRules;
 my $rules = WWW::RobotRules->new('MOMspider/1.0');

 use LWP::Simple qw(get);

 {
   my $url = "http://some.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 {
   my $url = "http://some.other.place/robots.txt";
   my $robots_txt = get $url;
   $rules->parse($url, $robots_txt) if defined $robots_txt;
 }

 # Now we can check if a URL is valid for those servers
 # whose "robots.txt" files we've gotten and parsed:
 if($rules->allowed($url)) {
     $c = get $url;
     ...
 }

=head1 DESCRIPTION

This module parses F</robots.txt> files as specified in
"A Standard for Robot Exclusion", at
<http://www.robotstxt.org/wc/norobots.html>.
Webmasters can use the F</robots.txt> file to forbid conforming
robots from accessing parts of their web site.

The parsed files are kept in a WWW::RobotRules object, and this object
provides methods to check if access to a given URL is prohibited. The
same WWW::RobotRules object can be used for one or more parsed
F</robots.txt> files on any number of hosts.

The following methods are provided:

=over 4

=item $rules = WWW::RobotRules->new($robot_name)

This is the constructor for WWW::RobotRules objects. The first
argument given to new() is the name of the robot.

=item $rules->parse($robot_txt_url, $content, $fresh_until)

The parse() method takes as arguments the URL that was used to
retrieve the F</robots.txt> file, and the contents of the file. The
optional $fresh_until argument is the time (seconds since the epoch)
until which the parsed rules are considered fresh; if it is omitted,
the rules are kept for one year. A combined usage sketch follows this
method list.

=item $rules->allowed($uri)

Returns TRUE if this robot is allowed to retrieve this URL.

=item $rules->agent([$name])

Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
rules and expire times out of the cache.

=back
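
The sketch below is only an illustration of how these methods fit
together; the robot name, the host name, and the C<LWP::Simple> fetch
are placeholders rather than anything prescribed by this module.

 use WWW::RobotRules;
 use LWP::Simple qw(get);

 my $rules = WWW::RobotRules->new('MyBot/0.1');  # short name becomes "MyBot"

 # Fetch and parse the robots.txt of a (hypothetical) host
 my $robots_url = "http://www.example.com/robots.txt";
 my $robots_txt = get($robots_url);
 $rules->parse($robots_url, $robots_txt) if defined $robots_txt;

 # allowed() does prefix matching against the stored Disallow rules
 my $status = $rules->allowed("http://www.example.com/private/page.html");
 if ($status == 1) {
     # allowed; go ahead and fetch the page
 }
 elsif ($status == 0) {
     # disallowed by a matching Disallow rule
 }
 else {
     # allowed() returned -1: no fresh rules are cached for this host,
     # so its robots.txt should be (re)fetched and parsed first
 }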

=head1 ROBOTS.TXT

The format and semantics of the "/robots.txt" file are as follows
(this is an edited abstract of
<http://www.robotstxt.org/wc/norobots.html>):

The file consists of one or more records separated by one or more
blank lines. Each record contains lines of the form

  <field-name>: <value>

The field name is case insensitive. Text after the '#' character on a
line is ignored during parsing. This is used for comments. The
following <field-names> can be used:

=over 3

=item User-Agent

The value of this field is the name of the robot the record is
describing access policy for. If more than one I<User-Agent> field is
present the record describes an identical access policy for more than
one robot. At least one field needs to be present per record. If the
value is '*', the record describes the default access policy for any
robot that has not matched any of the other records.

The I<User-Agent> fields must occur before the I<Disallow> fields. If a
record contains a I<User-Agent> field after a I<Disallow> field, that
constitutes a malformed record. This parser will assume that a blank
line should have been placed before that I<User-Agent> field, and will
break the record into two. All the fields before the I<User-Agent> field
will constitute a record, and the I<User-Agent> field will be the first
field in a new record.

=item Disallow

The value of this field specifies a partial URL that is not to be
visited. This can be a full path, or a partial path; any URL that
starts with this value will not be retrieved.

=back

Unrecognized records are ignored.

=head1 ROBOTS.TXT EXAMPLES

The following example "/robots.txt" file specifies that no robots
should visit any URL starting with "/cyberworld/map/" or "/tmp/":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space
  Disallow: /tmp/ # these will soon disappear

This example "/robots.txt" file specifies that no robots should visit
any URL starting with "/cyberworld/map/", except the robot called
"cybermapper":

  User-agent: *
  Disallow: /cyberworld/map/ # This is an infinite virtual URL space

  # Cybermapper knows where to go.
  User-agent: cybermapper
  Disallow:

This example indicates that no robots should visit this site further:

  # go away
  User-agent: *
  Disallow: /

This is an example of a malformed robots.txt file.

  # robots.txt for ancientcastle.example.com
  # I've locked myself away.
  User-agent: *
  Disallow: /
  # The castle is your home now, so you can go anywhere you like.
  User-agent: Belle
  Disallow: /west-wing/ # except the west wing!
  # It's good to be the Prince...
  User-agent: Beast
  Disallow:

This file is missing the required blank lines between records.
However, the intention is clear.
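
As a rough illustration (reusing the made-up host and robot names from
the example above, and assuming $malformed_robots_txt holds that text),
the parser's recovery means a robot named "Belle" still gets just its
own record:

 use WWW::RobotRules;

 my $rules = WWW::RobotRules->new('Belle/1.0');
 $rules->parse("http://ancientcastle.example.com/robots.txt",
               $malformed_robots_txt);  # the malformed text shown above

 $rules->allowed("http://ancientcastle.example.com/west-wing/tower");
 # => 0 (Belle's "Disallow: /west-wing/" rule matches)

 $rules->allowed("http://ancientcastle.example.com/library");
 # => 1 (no rule from Belle's record matches)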

=head1 SEE ALSO

L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>