source: trunk/gsdl/perllib/cpan/HTML/LinkExtor.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago

Perl modules required for HTMLTidy

  • Property svn:keywords set to Author Date Id Revision
File size: 4.4 KB
Line 
1package HTML::LinkExtor;
2
3# $Id: LinkExtor.pm 14078 2007-05-17 03:15:41Z lh92 $
4
5require HTML::Parser;
6@ISA = qw(HTML::Parser);
7$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
8
9=head1 NAME
10
11HTML::LinkExtor - Extract links from an HTML document
12
13=head1 SYNOPSIS
14
15 require HTML::LinkExtor;
16 $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
17 sub cb {
18 my($tag, %links) = @_;
19 print "$tag @{[%links]}\n";
20 }
21 $p->parse_file("index.html");
22
23=head1 DESCRIPTION
24
25I<HTML::LinkExtor> is an HTML parser that extracts links from an
26HTML document. The I<HTML::LinkExtor> is a subclass of
27I<HTML::Parser>. This means that the document should be given to the
28parser by calling the $p->parse() or $p->parse_file() methods.
29
30=cut
31
32use strict;
33use HTML::Tagset ();
34
35# legacy (some applications grabs this hash directly)
36use vars qw(%LINK_ELEMENT);
37*LINK_ELEMENT = \%HTML::Tagset::linkElements;
38
39=over 4
40
41=item $p = HTML::LinkExtor->new
42
43=item $p = HTML::LinkExtor->new( $callback )
44
45=item $p = HTML::LinkExtor->new( $callback, $base )
46
47The constructor takes two optional arguments. The first is a reference
48to a callback routine. It will be called as links are found. If a
49callback is not provided, then links are just accumulated internally
50and can be retrieved by calling the $p->links() method.
51
52The $base argument is an optional base URL used to absolutize all URLs found.
53You need to have the I<URI> module installed if you provide $base.
54
55The callback is called with the lowercase tag name as first argument,
56and then all link attributes as separate key/value pairs. All
57non-link attributes are removed.
58
59=cut
60
61sub new
62{
63 my($class, $cb, $base) = @_;
64 my $self = $class->SUPER::new(
65 start_h => ["_start_tag", "self,tagname,attr"],
66 report_tags => [keys %HTML::Tagset::linkElements],
67 );
68 $self->{extractlink_cb} = $cb;
69 if ($base) {
70 require URI;
71 $self->{extractlink_base} = URI->new($base);
72 }
73 $self;
74}
75
76sub _start_tag
77{
78 my($self, $tag, $attr) = @_;
79
80 my $base = $self->{extractlink_base};
81 my $links = $HTML::Tagset::linkElements{$tag};
82 $links = [$links] unless ref $links;
83
84 my @links;
85 my $a;
86 for $a (@$links) {
87 next unless exists $attr->{$a};
88 push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
89 : $attr->{$a});
90 }
91 return unless @links;
92 $self->_found_link($tag, @links);
93}
94
95sub _found_link
96{
97 my $self = shift;
98 my $cb = $self->{extractlink_cb};
99 if ($cb) {
100 &$cb(@_);
101 } else {
102 push(@{$self->{'links'}}, [@_]);
103 }
104}
105
106=item $p->links
107
108Returns a list of all links found in the document. The returned
109values will be anonymous arrays with the follwing elements:
110
111 [$tag, $attr => $url1, $attr2 => $url2,...]
112
113The $p->links method will also truncate the internal link list. This
114means that if the method is called twice without any parsing
115between them the second call will return an empty list.
116
117Also note that $p->links will always be empty if a callback routine
118was provided when the I<HTML::LinkExtor> was created.
119
120=cut
121
122sub links
123{
124 my $self = shift;
125 exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
126}
127
128# We override the parse_file() method so that we can clear the links
129# before we start a new file.
130sub parse_file
131{
132 my $self = shift;
133 delete $self->{'links'};
134 $self->SUPER::parse_file(@_);
135}
136
137=back
138
139=head1 EXAMPLE
140
141This is an example showing how you can extract links from a document
142received using LWP:
143
144 use LWP::UserAgent;
145 use HTML::LinkExtor;
146 use URI::URL;
147
148 $url = "http://www.perl.org/"; # for instance
149 $ua = LWP::UserAgent->new;
150
151 # Set up a callback that collect image links
152 my @imgs = ();
153 sub callback {
154 my($tag, %attr) = @_;
155 return if $tag ne 'img'; # we only look closer at <img ...>
156 push(@imgs, values %attr);
157 }
158
159 # Make the parser. Unfortunately, we don't know the base yet
160 # (it might be diffent from $url)
161 $p = HTML::LinkExtor->new(\&callback);
162
163 # Request document and parse it as it arrives
164 $res = $ua->request(HTTP::Request->new(GET => $url),
165 sub {$p->parse($_[0])});
166
167 # Expand all image URLs to absolute ones
168 my $base = $res->base;
169 @imgs = map { $_ = url($_, $base)->abs; } @imgs;
170
171 # Print them out
172 print join("\n", @imgs), "\n";
173
174=head1 SEE ALSO
175
176L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
177
178=head1 COPYRIGHT
179
180Copyright 1996-2001 Gisle Aas.
181
182This library is free software; you can redistribute it and/or
183modify it under the same terms as Perl itself.
184
185=cut
186
1871;
Note: See TracBrowser for help on using the repository browser.