1 | package HTML::LinkExtor;
|
---|
2 |
|
---|
3 | # $Id: LinkExtor.pm 14078 2007-05-17 03:15:41Z lh92 $
|
---|
4 |
|
---|
5 | require HTML::Parser;
|
---|
6 | @ISA = qw(HTML::Parser);
|
---|
7 | $VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
|
---|
8 |
|
---|
9 | =head1 NAME
|
---|
10 |
|
---|
11 | HTML::LinkExtor - Extract links from an HTML document
|
---|
12 |
|
---|
13 | =head1 SYNOPSIS
|
---|
14 |
|
---|
15 | require HTML::LinkExtor;
|
---|
16 | $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
|
---|
17 | sub cb {
|
---|
18 | my($tag, %links) = @_;
|
---|
19 | print "$tag @{[%links]}\n";
|
---|
20 | }
|
---|
21 | $p->parse_file("index.html");
|
---|
22 |
|
---|
23 | =head1 DESCRIPTION
|
---|
24 |
|
---|
25 | I<HTML::LinkExtor> is an HTML parser that extracts links from an
|
---|
26 | HTML document. The I<HTML::LinkExtor> is a subclass of
|
---|
27 | I<HTML::Parser>. This means that the document should be given to the
|
---|
28 | parser by calling the $p->parse() or $p->parse_file() methods.
|
---|
29 |
|
---|
30 | =cut
|
---|
31 |
|
---|
32 | use strict;
|
---|
33 | use HTML::Tagset ();
|
---|
34 |
|
---|
35 | # legacy (some applications grabs this hash directly)
|
---|
36 | use vars qw(%LINK_ELEMENT);
|
---|
37 | *LINK_ELEMENT = \%HTML::Tagset::linkElements;
|
---|
38 |
|
---|
39 | =over 4
|
---|
40 |
|
---|
41 | =item $p = HTML::LinkExtor->new
|
---|
42 |
|
---|
43 | =item $p = HTML::LinkExtor->new( $callback )
|
---|
44 |
|
---|
45 | =item $p = HTML::LinkExtor->new( $callback, $base )
|
---|
46 |
|
---|
47 | The constructor takes two optional arguments. The first is a reference
|
---|
48 | to a callback routine. It will be called as links are found. If a
|
---|
49 | callback is not provided, then links are just accumulated internally
|
---|
50 | and can be retrieved by calling the $p->links() method.
|
---|
51 |
|
---|
52 | The $base argument is an optional base URL used to absolutize all URLs found.
|
---|
53 | You need to have the I<URI> module installed if you provide $base.
|
---|
54 |
|
---|
55 | The callback is called with the lowercase tag name as first argument,
|
---|
56 | and then all link attributes as separate key/value pairs. All
|
---|
57 | non-link attributes are removed.
|
---|
58 |
|
---|
59 | =cut
|
---|
60 |
|
---|
61 | sub new
|
---|
62 | {
|
---|
63 | my($class, $cb, $base) = @_;
|
---|
64 | my $self = $class->SUPER::new(
|
---|
65 | start_h => ["_start_tag", "self,tagname,attr"],
|
---|
66 | report_tags => [keys %HTML::Tagset::linkElements],
|
---|
67 | );
|
---|
68 | $self->{extractlink_cb} = $cb;
|
---|
69 | if ($base) {
|
---|
70 | require URI;
|
---|
71 | $self->{extractlink_base} = URI->new($base);
|
---|
72 | }
|
---|
73 | $self;
|
---|
74 | }
|
---|
75 |
|
---|
76 | sub _start_tag
|
---|
77 | {
|
---|
78 | my($self, $tag, $attr) = @_;
|
---|
79 |
|
---|
80 | my $base = $self->{extractlink_base};
|
---|
81 | my $links = $HTML::Tagset::linkElements{$tag};
|
---|
82 | $links = [$links] unless ref $links;
|
---|
83 |
|
---|
84 | my @links;
|
---|
85 | my $a;
|
---|
86 | for $a (@$links) {
|
---|
87 | next unless exists $attr->{$a};
|
---|
88 | push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
|
---|
89 | : $attr->{$a});
|
---|
90 | }
|
---|
91 | return unless @links;
|
---|
92 | $self->_found_link($tag, @links);
|
---|
93 | }
|
---|
94 |
|
---|
95 | sub _found_link
|
---|
96 | {
|
---|
97 | my $self = shift;
|
---|
98 | my $cb = $self->{extractlink_cb};
|
---|
99 | if ($cb) {
|
---|
100 | &$cb(@_);
|
---|
101 | } else {
|
---|
102 | push(@{$self->{'links'}}, [@_]);
|
---|
103 | }
|
---|
104 | }
|
---|
105 |
|
---|
106 | =item $p->links
|
---|
107 |
|
---|
108 | Returns a list of all links found in the document. The returned
|
---|
109 | values will be anonymous arrays with the follwing elements:
|
---|
110 |
|
---|
111 | [$tag, $attr => $url1, $attr2 => $url2,...]
|
---|
112 |
|
---|
113 | The $p->links method will also truncate the internal link list. This
|
---|
114 | means that if the method is called twice without any parsing
|
---|
115 | between them the second call will return an empty list.
|
---|
116 |
|
---|
117 | Also note that $p->links will always be empty if a callback routine
|
---|
118 | was provided when the I<HTML::LinkExtor> was created.
|
---|
119 |
|
---|
120 | =cut
|
---|
121 |
|
---|
122 | sub links
|
---|
123 | {
|
---|
124 | my $self = shift;
|
---|
125 | exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
|
---|
126 | }
|
---|
127 |
|
---|
128 | # We override the parse_file() method so that we can clear the links
|
---|
129 | # before we start a new file.
|
---|
130 | sub parse_file
|
---|
131 | {
|
---|
132 | my $self = shift;
|
---|
133 | delete $self->{'links'};
|
---|
134 | $self->SUPER::parse_file(@_);
|
---|
135 | }
|
---|
136 |
|
---|
137 | =back
|
---|
138 |
|
---|
139 | =head1 EXAMPLE
|
---|
140 |
|
---|
141 | This is an example showing how you can extract links from a document
|
---|
142 | received using LWP:
|
---|
143 |
|
---|
144 | use LWP::UserAgent;
|
---|
145 | use HTML::LinkExtor;
|
---|
146 | use URI::URL;
|
---|
147 |
|
---|
148 | $url = "http://www.perl.org/"; # for instance
|
---|
149 | $ua = LWP::UserAgent->new;
|
---|
150 |
|
---|
151 | # Set up a callback that collect image links
|
---|
152 | my @imgs = ();
|
---|
153 | sub callback {
|
---|
154 | my($tag, %attr) = @_;
|
---|
155 | return if $tag ne 'img'; # we only look closer at <img ...>
|
---|
156 | push(@imgs, values %attr);
|
---|
157 | }
|
---|
158 |
|
---|
159 | # Make the parser. Unfortunately, we don't know the base yet
|
---|
160 | # (it might be diffent from $url)
|
---|
161 | $p = HTML::LinkExtor->new(\&callback);
|
---|
162 |
|
---|
163 | # Request document and parse it as it arrives
|
---|
164 | $res = $ua->request(HTTP::Request->new(GET => $url),
|
---|
165 | sub {$p->parse($_[0])});
|
---|
166 |
|
---|
167 | # Expand all image URLs to absolute ones
|
---|
168 | my $base = $res->base;
|
---|
169 | @imgs = map { $_ = url($_, $base)->abs; } @imgs;
|
---|
170 |
|
---|
171 | # Print them out
|
---|
172 | print join("\n", @imgs), "\n";
|
---|
173 |
|
---|
174 | =head1 SEE ALSO
|
---|
175 |
|
---|
176 | L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
|
---|
177 |
|
---|
178 | =head1 COPYRIGHT
|
---|
179 |
|
---|
180 | Copyright 1996-2001 Gisle Aas.
|
---|
181 |
|
---|
182 | This library is free software; you can redistribute it and/or
|
---|
183 | modify it under the same terms as Perl itself.
|
---|
184 |
|
---|
185 | =cut
|
---|
186 |
|
---|
187 | 1;
|
---|