source: main/trunk/greenstone2/perllib/cpan/LWP/RobotUA.pm@ 27174

Last change on this file since 27174 was 27174, checked in by davidb, 11 years ago

Perl modules from CPAN that are used in supporting activate.pl, but not part of the Perl core. Only PMs included.

File size: 7.5 KB
Line 
1package LWP::RobotUA;
2
3require LWP::UserAgent;
4@ISA = qw(LWP::UserAgent);
5$VERSION = "6.03";
6
7require WWW::RobotRules;
8require HTTP::Request;
9require HTTP::Response;
10
11use Carp ();
12use HTTP::Status ();
13use HTTP::Date qw(time2str);
14use strict;
15
16
17#
18# Additional attributes in addition to those found in LWP::UserAgent:
19#
20# $self->{'delay'} Required delay between request to the same
21# server in minutes.
22#
23# $self->{'rules'} A WWW::RobotRules object
24#
25
# Construct a robot user agent.
#
# Accepts either an option hash (the LWP::UserAgent style) or the
# legacy positional form ($agent, $from, $rules).  The options
# 'delay', 'use_sleep' and 'rules' are ours; everything else is
# passed through to the LWP::UserAgent constructor.  Croaks unless
# an agent name and a from-address containing '@' are supplied.
sub new
{
    my $class = shift;

    my %opt;
    if (@_ >= 4) {
	%opt = @_;
    }
    else {
	# legacy args
	@opt{qw(agent from rules)} = @_;
    }

    Carp::croak('LWP::RobotUA agent required') unless $opt{agent};
    Carp::croak('LWP::RobotUA from address required')
	unless $opt{from} && $opt{from} =~ m/\@/;

    # Pull out our private attributes before handing the rest on.
    my $min_delay = delete $opt{delay} || 1;    # minutes between hits
    my $sleep_flag = delete $opt{use_sleep};
    $sleep_flag = 1 unless defined($sleep_flag); # sleep by default
    my $robot_rules = delete $opt{rules};

    my $self = bless LWP::UserAgent->new(%opt), $class;

    $self->{'delay'} = $min_delay;    # minutes
    $self->{'use_sleep'} = $sleep_flag;

    if ($robot_rules) {
	# Caller-supplied rules object: sync it to our agent name.
	$robot_rules->agent($opt{agent});
	$self->{'rules'} = $robot_rules;
    }
    else {
	$self->{'rules'} = WWW::RobotRules->new($opt{agent});
    }

    return $self;
}
63
64
# Get/set the minimum delay, in minutes, between requests to the
# same server (delegates storage to LWP::UserAgent's _elem helper).
sub delay {
    my $self = shift;
    return $self->_elem('delay', @_);
}
# Get/set whether the agent sleeps (vs. returning 503) when a
# request arrives before the per-host delay has elapsed.
sub use_sleep {
    my $self = shift;
    return $self->_elem('use_sleep', @_);
}
67
68
# Get/set the agent name.  A name change invalidates any cached
# robots.txt permissions, so the rules object is told to start fresh.
sub agent
{
    my $self = shift;
    my $previous = $self->SUPER::agent(@_);

    # Changing our name means to start fresh
    $self->{'rules'}->agent($self->{'agent'}) if @_;

    return $previous;
}
79
80
# Get/set the WWW::RobotRules object.  When a new rules object is
# installed it is synced to our current agent name.
sub rules {
    my $self = shift;
    my $previous = $self->_elem('rules', @_);
    $self->{'rules'}->agent($self->{'agent'}) if @_;
    return $previous;
}
87
88
# Number of documents fetched from the given server, as recorded by
# the rules object.  Returns 0 (not undef) for hosts never visited.
sub no_visits
{
    my ($self, $netloc) = @_;
    return $self->{'rules'}->no_visits($netloc) || 0;
}

# Old name, kept for backwards compatibility with LWP-5.02.
*host_count = \&no_visits;
96
97
# Seconds (from now) to wait before the given host may be contacted
# again.  Returns 0 when the host has never been visited or the
# configured delay has already elapsed; undef when $netloc is undef.
sub host_wait
{
    my ($self, $netloc) = @_;
    return undef unless defined $netloc;

    my $last_visit = $self->{'rules'}->last_visit($netloc);
    return 0 unless $last_visit;

    # Remaining part of the delay window ('delay' is in minutes),
    # clamped so we never report a negative wait.
    my $remaining = int($self->{'delay'} * 60 - (time - $last_visit));
    return $remaining > 0 ? $remaining : 0;
}
110
111
# Override of LWP::UserAgent::simple_request that enforces robot
# etiquette: consults (and if necessary fetches) the host's
# robots.txt, refuses disallowed URLs with a 403, and throttles
# repeat visits to the same host via sleep() or a synthesized 503,
# depending on the use_sleep attribute.
sub simple_request
{
    my($self, $request, $arg, $size) = @_;

    # Do we try to access a new server?
    # allowed() returns a negative value when no (fresh) robots.txt
    # is cached for this host.
    my $allowed = $self->{'rules'}->allowed($request->uri);

    if ($allowed < 0) {
	# Host is not visited before, or robots.txt expired; fetch "robots.txt"
	my $robot_url = $request->uri->clone;
	$robot_url->path("robots.txt");
	$robot_url->query(undef);

	# make access to robot.txt legal since this will be a recursive call
	$self->{'rules'}->parse($robot_url, "");

	my $robot_req = HTTP::Request->new('GET', $robot_url);
	# Temporarily disable <head> parsing while fetching robots.txt,
	# then restore the caller's previous setting.
	my $parse_head = $self->parse_head(0);
	my $robot_res = $self->request($robot_req);
	$self->parse_head($parse_head);
	my $fresh_until = $robot_res->fresh_until;
	my $content = "";
	if ($robot_res->is_success && $robot_res->content_is_text) {
	    $content = $robot_res->decoded_content;
	    # Heuristic: only trust the body as a real robots.txt if it
	    # contains at least one Disallow line; otherwise treat the
	    # host as having no restrictions.
	    $content = "" unless $content && $content =~ /^\s*Disallow\s*:/mi;
	}
	# Cache the (possibly empty) rules until the response goes stale.
	$self->{'rules'}->parse($robot_url, $content, $fresh_until);

	# recalculate allowed...
	$allowed = $self->{'rules'}->allowed($request->uri);
    }

    # Check rules
    unless ($allowed) {
	my $res = HTTP::Response->new(
	  &HTTP::Status::RC_FORBIDDEN, 'Forbidden by robots.txt');
	$res->request( $request ); # bind it to that request
	return $res;
    }

    # host_port can die for schemes without a host part; swallow that
    # (and shield any global __DIE__ handler) so $netloc is just undef.
    my $netloc = eval { local $SIG{__DIE__}; $request->uri->host_port; };
    my $wait = $self->host_wait($netloc);

    if ($wait) {
	if ($self->{'use_sleep'}) {
	    sleep($wait)
	}
	else {
	    # Caller opted out of sleeping: answer with a synthetic 503
	    # telling them when it is OK to retry.
	    my $res = HTTP::Response->new(
	      &HTTP::Status::RC_SERVICE_UNAVAILABLE, 'Please, slow down');
	    $res->header('Retry-After', time2str(time + $wait));
	    $res->request( $request ); # bind it to that request
	    return $res;
	}
    }

    # Perform the request
    my $res = $self->SUPER::simple_request($request, $arg, $size);

    # Record the visit so host_wait() throttles the next request.
    $self->{'rules'}->visit($netloc);

    $res;
}
175
176
# Human-readable description of the agent's state (mainly for
# debugging).  The returned string ends with a trailing newline.
sub as_string
{
    my $self = shift;

    my @lines = (
	"Robot: $self->{'agent'} operated by $self->{'from'} [$self]",
	" Minimum delay: " . int($self->{'delay'}*60) . "s",
    );
    push(@lines, " Will sleep if too early") if $self->{'use_sleep'};
    push(@lines, " Rules = $self->{'rules'}");

    return join("\n", @lines, '');
}
187
1881;
189
190
191__END__
192
193=head1 NAME
194
195LWP::RobotUA - a class for well-behaved Web robots
196
197=head1 SYNOPSIS
198
199 use LWP::RobotUA;
200 my $ua = LWP::RobotUA->new('my-robot/0.1', '[email protected]');
201 $ua->delay(10); # be very nice -- max one hit every ten minutes!
202 ...
203
204 # Then just use it just like a normal LWP::UserAgent:
205 my $response = $ua->get('http://whatever.int/...');
206 ...
207
208=head1 DESCRIPTION
209
210This class implements a user agent that is suitable for robot
211applications. Robots should be nice to the servers they visit. They
212should consult the F</robots.txt> file to ensure that they are welcomed
213and they should not make requests too frequently.
214
215But before you consider writing a robot, take a look at
216<URL:http://www.robotstxt.org/>.
217
218When you use a I<LWP::RobotUA> object as your user agent, then you do not
219really have to think about these things yourself; C<robots.txt> files
220are automatically consulted and obeyed, the server isn't queried
221too rapidly, and so on. Just send requests
222as you do when you are using a normal I<LWP::UserAgent>
223object (using C<< $ua->get(...) >>, C<< $ua->head(...) >>,
224C<< $ua->request(...) >>, etc.), and this
225special agent will make sure you are nice.
226
227=head1 METHODS
228
229The LWP::RobotUA is a sub-class of LWP::UserAgent and implements the
230same methods. In addition the following methods are provided:
231
232=over 4
233
234=item $ua = LWP::RobotUA->new( %options )
235
236=item $ua = LWP::RobotUA->new( $agent, $from )
237
238=item $ua = LWP::RobotUA->new( $agent, $from, $rules )
239
240The LWP::UserAgent options C<agent> and C<from> are mandatory. The
241options C<delay>, C<use_sleep> and C<rules> initialize attributes
242private to the RobotUA. If C<rules> are not provided, then
243C<WWW::RobotRules> is instantiated providing an internal database of
244F<robots.txt>.
245
246It is also possible to just pass the value of C<agent>, C<from> and
247optionally C<rules> as plain positional arguments.
248
249=item $ua->delay
250
251=item $ua->delay( $minutes )
252
253Get/set the minimum delay between requests to the same server, in
254I<minutes>. The default is 1 minute. Note that this number doesn't
255have to be an integer; for example, this sets the delay to 10 seconds:
256
257 $ua->delay(10/60);
258
259=item $ua->use_sleep
260
261=item $ua->use_sleep( $boolean )
262
263Get/set a value indicating whether the UA should sleep() if requests
arrive too fast -- that is, when fewer than $ua->delay minutes have
passed since the last request to the given server. The default is TRUE. If this value is
266FALSE then an internal SERVICE_UNAVAILABLE response will be generated.
It will have a Retry-After header that indicates when it is OK to
268send another request to this server.
269
270=item $ua->rules
271
272=item $ua->rules( $rules )
273
274Set/get which I<WWW::RobotRules> object to use.
275
276=item $ua->no_visits( $netloc )
277
278Returns the number of documents fetched from this server host. Yeah I
279know, this method should probably have been named num_visits() or
280something like that. :-(
281
282=item $ua->host_wait( $netloc )
283
284Returns the number of I<seconds> (from now) you must wait before you can
285make a new request to this host.
286
287=item $ua->as_string
288
289Returns a string that describes the state of the UA.
290Mainly useful for debugging.
291
292=back
293
294=head1 SEE ALSO
295
296L<LWP::UserAgent>, L<WWW::RobotRules>
297
298=head1 COPYRIGHT
299
300Copyright 1996-2004 Gisle Aas.
301
302This library is free software; you can redistribute it and/or
303modify it under the same terms as Perl itself.
Note: See TracBrowser for help on using the repository browser.