Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/cpan/HTML/LinkExtor.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago
Perl modules required for HTMLTidy
Property svn:keywords set to `Author Date Id Revision`
File size: 4.4 KB

Line
1	package HTML::LinkExtor;
2
3	# $Id: LinkExtor.pm 14078 2007-05-17 03:15:41Z lh92 $
4
5	require HTML::Parser;
6	@ISA = qw(HTML::Parser);
7	$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
8
9	=head1 NAME
10
11	HTML::LinkExtor - Extract links from an HTML document
12
13	=head1 SYNOPSIS
14
15	require HTML::LinkExtor;
16	$p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
17	sub cb {
18	my($tag, %links) = @_;
19	print "$tag @{[%links]}\n";
20	}
21	$p->parse_file("index.html");
22
23	=head1 DESCRIPTION
24
25	I<HTML::LinkExtor> is an HTML parser that extracts links from an
26	HTML document. The I<HTML::LinkExtor> is a subclass of
27	I<HTML::Parser>. This means that the document should be given to the
28	parser by calling the $p->parse() or $p->parse_file() methods.
29
30	=cut
31
32	use strict;
33	use HTML::Tagset ();
34
35	# legacy (some applications grabs this hash directly)
36	use vars qw(%LINK_ELEMENT);
37	*LINK_ELEMENT = \%HTML::Tagset::linkElements;
38
39	=over 4
40
41	=item $p = HTML::LinkExtor->new
42
43	=item $p = HTML::LinkExtor->new( $callback )
44
45	=item $p = HTML::LinkExtor->new( $callback, $base )
46
47	The constructor takes two optional arguments. The first is a reference
48	to a callback routine. It will be called as links are found. If a
49	callback is not provided, then links are just accumulated internally
50	and can be retrieved by calling the $p->links() method.
51
52	The $base argument is an optional base URL used to absolutize all URLs found.
53	You need to have the I<URI> module installed if you provide $base.
54
55	The callback is called with the lowercase tag name as first argument,
56	and then all link attributes as separate key/value pairs. All
57	non-link attributes are removed.
58
59	=cut
60
61	sub new
62	{
63	my($class, $cb, $base) = @_;
64	my $self = $class->SUPER::new(
65	start_h => ["_start_tag", "self,tagname,attr"],
66	report_tags => [keys %HTML::Tagset::linkElements],
67	);
68	$self->{extractlink_cb} = $cb;
69	if ($base) {
70	require URI;
71	$self->{extractlink_base} = URI->new($base);
72	}
73	$self;
74	}
75
76	sub _start_tag
77	{
78	my($self, $tag, $attr) = @_;
79
80	my $base = $self->{extractlink_base};
81	my $links = $HTML::Tagset::linkElements{$tag};
82	$links = [$links] unless ref $links;
83
84	my @links;
85	my $a;
86	for $a (@$links) {
87	next unless exists $attr->{$a};
88	push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base)
89	: $attr->{$a});
90	}
91	return unless @links;
92	$self->_found_link($tag, @links);
93	}
94
95	sub _found_link
96	{
97	my $self = shift;
98	my $cb = $self->{extractlink_cb};
99	if ($cb) {
100	&$cb(@_);
101	} else {
102	push(@{$self->{'links'}}, [@_]);
103	}
104	}
105
106	=item $p->links
107
108	Returns a list of all links found in the document. The returned
109	values will be anonymous arrays with the follwing elements:
110
111	[$tag, $attr => $url1, $attr2 => $url2,...]
112
113	The $p->links method will also truncate the internal link list. This
114	means that if the method is called twice without any parsing
115	between them the second call will return an empty list.
116
117	Also note that $p->links will always be empty if a callback routine
118	was provided when the I<HTML::LinkExtor> was created.
119
120	=cut
121
122	sub links
123	{
124	my $self = shift;
125	exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
126	}
127
128	# We override the parse_file() method so that we can clear the links
129	# before we start a new file.
130	sub parse_file
131	{
132	my $self = shift;
133	delete $self->{'links'};
134	$self->SUPER::parse_file(@_);
135	}
136
137	=back
138
139	=head1 EXAMPLE
140
141	This is an example showing how you can extract links from a document
142	received using LWP:
143
144	use LWP::UserAgent;
145	use HTML::LinkExtor;
146	use URI::URL;
147
148	$url = "http://www.perl.org/"; # for instance
149	$ua = LWP::UserAgent->new;
150
151	# Set up a callback that collect image links
152	my @imgs = ();
153	sub callback {
154	my($tag, %attr) = @_;
155	return if $tag ne 'img'; # we only look closer at <img ...>
156	push(@imgs, values %attr);
157	}
158
159	# Make the parser. Unfortunately, we don't know the base yet
160	# (it might be diffent from $url)
161	$p = HTML::LinkExtor->new(\&callback);
162
163	# Request document and parse it as it arrives
164	$res = $ua->request(HTTP::Request->new(GET => $url),
165	sub {$p->parse($_[0])});
166
167	# Expand all image URLs to absolute ones
168	my $base = $res->base;
169	@imgs = map { $_ = url($_, $base)->abs; } @imgs;
170
171	# Print them out
172	print join("\n", @imgs), "\n";
173
174	=head1 SEE ALSO
175
176	L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
177
178	=head1 COPYRIGHT
179
180	Copyright 1996-2001 Gisle Aas.
181
182	This library is free software; you can redistribute it and/or
183	modify it under the same terms as Perl itself.
184
185	=cut
186
187	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: