1 | package WWW::RobotRules::AnyDBM_File;
|
---|
2 |
|
---|
3 | require WWW::RobotRules;
|
---|
4 | @ISA = qw(WWW::RobotRules);
|
---|
5 | $VERSION = "5.835";
|
---|
6 |
|
---|
7 | use Carp ();
|
---|
8 | use AnyDBM_File;
|
---|
9 | use Fcntl;
|
---|
10 | use strict;
|
---|
11 |
|
---|
12 | =head1 NAME
|
---|
13 |
|
---|
14 | WWW::RobotRules::AnyDBM_File - Persistent RobotRules
|
---|
15 |
|
---|
16 | =head1 SYNOPSIS
|
---|
17 |
|
---|
18 | require WWW::RobotRules::AnyDBM_File;
|
---|
19 | require LWP::RobotUA;
|
---|
20 |
|
---|
21 | # Create a robot useragent that uses a diskcaching RobotRules
|
---|
22 | my $rules = WWW::RobotRules::AnyDBM_File->new( 'my-robot/1.0', 'cachefile' );
|
---|
23 | my $ua = WWW::RobotUA->new( 'my-robot/1.0', '[email protected]', $rules );
|
---|
24 |
|
---|
25 | # Then just use $ua as usual
|
---|
26 | $res = $ua->request($req);
|
---|
27 |
|
---|
28 | =head1 DESCRIPTION
|
---|
29 |
|
---|
30 | This is a subclass of I<WWW::RobotRules> that uses the AnyDBM_File
|
---|
31 | package to implement persistent diskcaching of F<robots.txt> and host
|
---|
32 | visit information.
|
---|
33 |
|
---|
34 | The constructor (the new() method) takes an extra argument specifying
|
---|
35 | the name of the DBM file to use. If the DBM file already exists, then
|
---|
36 | you can specify undef as agent name as the name can be obtained from
|
---|
37 | the DBM database.
|
---|
38 |
|
---|
39 | =cut
|
---|
40 |
|
---|
41 | sub new
|
---|
42 | {
|
---|
43 | my ($class, $ua, $file) = @_;
|
---|
44 | Carp::croak('WWW::RobotRules::AnyDBM_File filename required') unless $file;
|
---|
45 |
|
---|
46 | my $self = bless { }, $class;
|
---|
47 | $self->{'filename'} = $file;
|
---|
48 | tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_CREAT|O_RDWR, 0640
|
---|
49 | or Carp::croak("Can't open $file: $!");
|
---|
50 |
|
---|
51 | if ($ua) {
|
---|
52 | $self->agent($ua);
|
---|
53 | }
|
---|
54 | else {
|
---|
55 | # Try to obtain name from DBM file
|
---|
56 | $ua = $self->{'dbm'}{"|ua-name|"};
|
---|
57 | Carp::croak("No agent name specified") unless $ua;
|
---|
58 | }
|
---|
59 |
|
---|
60 | $self;
|
---|
61 | }
|
---|
62 |
|
---|
63 | sub agent {
|
---|
64 | my($self, $newname) = @_;
|
---|
65 | my $old = $self->{'dbm'}{"|ua-name|"};
|
---|
66 | if (defined $newname) {
|
---|
67 | $newname =~ s!/?\s*\d+.\d+\s*$!!; # loose version
|
---|
68 | unless ($old && $old eq $newname) {
|
---|
69 | # Old info is now stale.
|
---|
70 | my $file = $self->{'filename'};
|
---|
71 | untie %{$self->{'dbm'}};
|
---|
72 | tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_TRUNC|O_RDWR, 0640;
|
---|
73 | %{$self->{'dbm'}} = ();
|
---|
74 | $self->{'dbm'}{"|ua-name|"} = $newname;
|
---|
75 | }
|
---|
76 | }
|
---|
77 | $old;
|
---|
78 | }
|
---|
79 |
|
---|
80 | sub no_visits {
|
---|
81 | my ($self, $netloc) = @_;
|
---|
82 | my $t = $self->{'dbm'}{"$netloc|vis"};
|
---|
83 | return 0 unless $t;
|
---|
84 | (split(/;\s*/, $t))[0];
|
---|
85 | }
|
---|
86 |
|
---|
87 | sub last_visit {
|
---|
88 | my ($self, $netloc) = @_;
|
---|
89 | my $t = $self->{'dbm'}{"$netloc|vis"};
|
---|
90 | return undef unless $t;
|
---|
91 | (split(/;\s*/, $t))[1];
|
---|
92 | }
|
---|
93 |
|
---|
94 | sub fresh_until {
|
---|
95 | my ($self, $netloc, $fresh) = @_;
|
---|
96 | my $old = $self->{'dbm'}{"$netloc|exp"};
|
---|
97 | if ($old) {
|
---|
98 | $old =~ s/;.*//; # remove cleartext
|
---|
99 | }
|
---|
100 | if (defined $fresh) {
|
---|
101 | $fresh .= "; " . localtime($fresh);
|
---|
102 | $self->{'dbm'}{"$netloc|exp"} = $fresh;
|
---|
103 | }
|
---|
104 | $old;
|
---|
105 | }
|
---|
106 |
|
---|
107 | sub visit {
|
---|
108 | my($self, $netloc, $time) = @_;
|
---|
109 | $time ||= time;
|
---|
110 |
|
---|
111 | my $count = 0;
|
---|
112 | my $old = $self->{'dbm'}{"$netloc|vis"};
|
---|
113 | if ($old) {
|
---|
114 | my $last;
|
---|
115 | ($count,$last) = split(/;\s*/, $old);
|
---|
116 | $time = $last if $last > $time;
|
---|
117 | }
|
---|
118 | $count++;
|
---|
119 | $self->{'dbm'}{"$netloc|vis"} = "$count; $time; " . localtime($time);
|
---|
120 | }
|
---|
121 |
|
---|
122 | sub push_rules {
|
---|
123 | my($self, $netloc, @rules) = @_;
|
---|
124 | my $cnt = 1;
|
---|
125 | $cnt++ while $self->{'dbm'}{"$netloc|r$cnt"};
|
---|
126 |
|
---|
127 | foreach (@rules) {
|
---|
128 | $self->{'dbm'}{"$netloc|r$cnt"} = $_;
|
---|
129 | $cnt++;
|
---|
130 | }
|
---|
131 | }
|
---|
132 |
|
---|
133 | sub clear_rules {
|
---|
134 | my($self, $netloc) = @_;
|
---|
135 | my $cnt = 1;
|
---|
136 | while ($self->{'dbm'}{"$netloc|r$cnt"}) {
|
---|
137 | delete $self->{'dbm'}{"$netloc|r$cnt"};
|
---|
138 | $cnt++;
|
---|
139 | }
|
---|
140 | }
|
---|
141 |
|
---|
142 | sub rules {
|
---|
143 | my($self, $netloc) = @_;
|
---|
144 | my @rules = ();
|
---|
145 | my $cnt = 1;
|
---|
146 | while (1) {
|
---|
147 | my $rule = $self->{'dbm'}{"$netloc|r$cnt"};
|
---|
148 | last unless $rule;
|
---|
149 | push(@rules, $rule);
|
---|
150 | $cnt++;
|
---|
151 | }
|
---|
152 | @rules;
|
---|
153 | }
|
---|
154 |
|
---|
155 | sub dump
|
---|
156 | {
|
---|
157 | }
|
---|
158 |
|
---|
159 | 1;
|
---|
160 |
|
---|
161 | =head1 SEE ALSO
|
---|
162 |
|
---|
163 | L<WWW::RobotRules>, L<LWP::RobotUA>
|
---|
164 |
|
---|
165 | =head1 AUTHORS
|
---|
166 |
|
---|
167 | Hakan Ardo E<lt>[email protected]>, Gisle Aas E<lt>[email protected]>
|
---|
168 |
|
---|
169 | =cut
|
---|
170 |
|
---|