1 | package HTML::HeadParser;
|
---|
2 |
|
---|
3 | =head1 NAME
|
---|
4 |
|
---|
5 | HTML::HeadParser - Parse <HEAD> section of a HTML document
|
---|
6 |
|
---|
7 | =head1 SYNOPSIS
|
---|
8 |
|
---|
9 | require HTML::HeadParser;
|
---|
10 | $p = HTML::HeadParser->new;
|
---|
11 | $p->parse($text) and print "not finished";
|
---|
12 |
|
---|
13 | $p->header('Title') # to access <title>....</title>
|
---|
14 | $p->header('Content-Base') # to access <base href="http://...">
|
---|
15 | $p->header('Foo') # to access <meta http-equiv="Foo" content="...">
|
---|
16 |
|
---|
17 | =head1 DESCRIPTION
|
---|
18 |
|
---|
19 | The C<HTML::HeadParser> is a specialized (and lightweight)
|
---|
20 | C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
|
---|
21 | section of an HTML document. The parse() method
|
---|
22 | will return a FALSE value as soon as some E<lt>BODY> element or body
|
---|
23 | text are found, and should not be called again after this.
|
---|
24 |
|
---|
25 | Note that the C<HTML::HeadParser> might get confused if raw undecoded
|
---|
26 | UTF-8 is passed to the parse() method. Make sure the strings are
|
---|
27 | properly decoded before passing them on.
|
---|
28 |
|
---|
29 | The C<HTML::HeadParser> keeps a reference to a header object, and the
|
---|
30 | parser will update this header object as the various elements of the
|
---|
31 | E<lt>HEAD> section of the HTML document are recognized. The following
|
---|
32 | header fields are affected:
|
---|
33 |
|
---|
34 | =over 4
|
---|
35 |
|
---|
36 | =item Content-Base:
|
---|
37 |
|
---|
38 | The I<Content-Base> header is initialized from the E<lt>base
|
---|
39 | href="..."> element.
|
---|
40 |
|
---|
41 | =item Title:
|
---|
42 |
|
---|
43 | The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
|
---|
44 | element.
|
---|
45 |
|
---|
46 | =item Isindex:
|
---|
47 |
|
---|
48 | The I<Isindex> header will be added if there is a E<lt>isindex>
|
---|
49 | element in the E<lt>head>. The header value is initialized from the
|
---|
50 | I<prompt> attribute if it is present. If no I<prompt> attribute is
|
---|
51 | given it will have '?' as the value.
|
---|
52 |
|
---|
53 | =item X-Meta-Foo:
|
---|
54 |
|
---|
55 | All E<lt>meta> elements will initialize headers with the prefix
|
---|
56 | "C<X-Meta->" on the name. If the E<lt>meta> element contains a
|
---|
57 | C<http-equiv> attribute, then it will be honored as the header name.
|
---|
58 |
|
---|
59 | =back
|
---|
60 |
|
---|
61 | =head1 METHODS
|
---|
62 |
|
---|
63 | The following methods (in addition to those provided by the
|
---|
64 | superclass) are available:
|
---|
65 |
|
---|
66 | =over 4
|
---|
67 |
|
---|
68 | =cut
|
---|
69 |
|
---|
70 |
|
---|
71 | require HTML::Parser;
|
---|
72 | @ISA = qw(HTML::Parser);
|
---|
73 |
|
---|
74 | use HTML::Entities ();
|
---|
75 |
|
---|
76 | use strict;
|
---|
77 | use vars qw($VERSION $DEBUG);
|
---|
78 | #$DEBUG = 1;
|
---|
79 | $VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
|
---|
80 |
|
---|
81 | =item $hp = HTML::HeadParser->new
|
---|
82 |
|
---|
83 | =item $hp = HTML::HeadParser->new( $header )
|
---|
84 |
|
---|
85 | The object constructor. The optional $header argument should be a
|
---|
86 | reference to an object that implement the header() and push_header()
|
---|
87 | methods as defined by the C<HTTP::Headers> class. Normally it will be
|
---|
88 | of some class that isa or delegates to the C<HTTP::Headers> class.
|
---|
89 |
|
---|
90 | If no $header is given C<HTML::HeadParser> will create an
|
---|
91 | C<HTTP::Header> object by itself (initially empty).
|
---|
92 |
|
---|
93 | =cut
|
---|
94 |
|
---|
95 | sub new
|
---|
96 | {
|
---|
97 | my($class, $header) = @_;
|
---|
98 | unless ($header) {
|
---|
99 | require HTTP::Headers;
|
---|
100 | $header = HTTP::Headers->new;
|
---|
101 | }
|
---|
102 |
|
---|
103 | my $self = $class->SUPER::new(api_version => 2,
|
---|
104 | ignore_elements => [qw(script style)],
|
---|
105 | );
|
---|
106 | $self->{'header'} = $header;
|
---|
107 | $self->{'tag'} = ''; # name of active element that takes textual content
|
---|
108 | $self->{'text'} = ''; # the accumulated text associated with the element
|
---|
109 | $self;
|
---|
110 | }
|
---|
111 |
|
---|
112 | =item $hp->header;
|
---|
113 |
|
---|
114 | Returns a reference to the header object.
|
---|
115 |
|
---|
116 | =item $hp->header( $key )
|
---|
117 |
|
---|
118 | Returns a header value. It is just a shorter way to write
|
---|
119 | C<$hp-E<gt>header-E<gt>header($key)>.
|
---|
120 |
|
---|
121 | =cut
|
---|
122 |
|
---|
123 | sub header
|
---|
124 | {
|
---|
125 | my $self = shift;
|
---|
126 | return $self->{'header'} unless @_;
|
---|
127 | $self->{'header'}->header(@_);
|
---|
128 | }
|
---|
129 |
|
---|
130 | sub as_string # legacy
|
---|
131 | {
|
---|
132 | my $self = shift;
|
---|
133 | $self->{'header'}->as_string;
|
---|
134 | }
|
---|
135 |
|
---|
136 | sub flush_text # internal
|
---|
137 | {
|
---|
138 | my $self = shift;
|
---|
139 | my $tag = $self->{'tag'};
|
---|
140 | my $text = $self->{'text'};
|
---|
141 | $text =~ s/^\s+//;
|
---|
142 | $text =~ s/\s+$//;
|
---|
143 | $text =~ s/\s+/ /g;
|
---|
144 | print "FLUSH $tag => '$text'\n" if $DEBUG;
|
---|
145 | if ($tag eq 'title') {
|
---|
146 | HTML::Entities::decode($text);
|
---|
147 | $self->{'header'}->push_header(Title => $text);
|
---|
148 | }
|
---|
149 | $self->{'tag'} = $self->{'text'} = '';
|
---|
150 | }
|
---|
151 |
|
---|
152 | # This is an quote from the HTML3.2 DTD which shows which elements
|
---|
153 | # that might be present in a <HEAD>...</HEAD>. Also note that the
|
---|
154 | # <HEAD> tags themselves might be missing:
|
---|
155 | #
|
---|
156 | # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
|
---|
157 | # SCRIPT* & META* & LINK*">
|
---|
158 | #
|
---|
159 | # <!ELEMENT HEAD O O (%head.content)>
|
---|
160 |
|
---|
161 |
|
---|
162 | sub start
|
---|
163 | {
|
---|
164 | my($self, $tag, $attr) = @_; # $attr is reference to a HASH
|
---|
165 | print "START[$tag]\n" if $DEBUG;
|
---|
166 | $self->flush_text if $self->{'tag'};
|
---|
167 | if ($tag eq 'meta') {
|
---|
168 | my $key = $attr->{'http-equiv'};
|
---|
169 | if (!defined($key) || !length($key)) {
|
---|
170 | return unless $attr->{'name'};
|
---|
171 | $key = "X-Meta-\u$attr->{'name'}";
|
---|
172 | }
|
---|
173 | $self->{'header'}->push_header($key => $attr->{content});
|
---|
174 | } elsif ($tag eq 'base') {
|
---|
175 | return unless exists $attr->{href};
|
---|
176 | $self->{'header'}->push_header('Content-Base' => $attr->{href});
|
---|
177 | } elsif ($tag eq 'isindex') {
|
---|
178 | # This is a non-standard header. Perhaps we should just ignore
|
---|
179 | # this element
|
---|
180 | $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
|
---|
181 | } elsif ($tag =~ /^(?:title|script|style)$/) {
|
---|
182 | # Just remember tag. Initialize header when we see the end tag.
|
---|
183 | $self->{'tag'} = $tag;
|
---|
184 | } elsif ($tag eq 'link') {
|
---|
185 | return unless exists $attr->{href};
|
---|
186 | # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
|
---|
187 | my $h_val = "<" . delete($attr->{href}) . ">";
|
---|
188 | for (sort keys %{$attr}) {
|
---|
189 | $h_val .= qq(; $_="$attr->{$_}");
|
---|
190 | }
|
---|
191 | $self->{'header'}->push_header(Link => $h_val);
|
---|
192 | } elsif ($tag eq 'head' || $tag eq 'html') {
|
---|
193 | # ignore
|
---|
194 | } else {
|
---|
195 | # stop parsing
|
---|
196 | $self->eof;
|
---|
197 | }
|
---|
198 | }
|
---|
199 |
|
---|
200 | sub end
|
---|
201 | {
|
---|
202 | my($self, $tag) = @_;
|
---|
203 | print "END[$tag]\n" if $DEBUG;
|
---|
204 | $self->flush_text if $self->{'tag'};
|
---|
205 | $self->eof if $tag eq 'head';
|
---|
206 | }
|
---|
207 |
|
---|
208 | sub text
|
---|
209 | {
|
---|
210 | my($self, $text) = @_;
|
---|
211 | $text =~ s/\x{FEFF}//; # drop Unicode BOM if found
|
---|
212 | print "TEXT[$text]\n" if $DEBUG;
|
---|
213 | my $tag = $self->{tag};
|
---|
214 | if (!$tag && $text =~ /\S/) {
|
---|
215 | # Normal text means start of body
|
---|
216 | $self->eof;
|
---|
217 | return;
|
---|
218 | }
|
---|
219 | return if $tag ne 'title';
|
---|
220 | $self->{'text'} .= $text;
|
---|
221 | }
|
---|
222 |
|
---|
223 | 1;
|
---|
224 |
|
---|
225 | __END__
|
---|
226 |
|
---|
227 | =back
|
---|
228 |
|
---|
229 | =head1 EXAMPLE
|
---|
230 |
|
---|
231 | $h = HTTP::Headers->new;
|
---|
232 | $p = HTML::HeadParser->new($h);
|
---|
233 | $p->parse(<<EOT);
|
---|
234 | <title>Stupid example</title>
|
---|
235 | <base href="http://www.linpro.no/lwp/">
|
---|
236 | Normal text starts here.
|
---|
237 | EOT
|
---|
238 | undef $p;
|
---|
239 | print $h->title; # should print "Stupid example"
|
---|
240 |
|
---|
241 | =head1 SEE ALSO
|
---|
242 |
|
---|
243 | L<HTML::Parser>, L<HTTP::Headers>
|
---|
244 |
|
---|
245 | The C<HTTP::Headers> class is distributed as part of the
|
---|
246 | I<libwww-perl> package. If you don't have that distribution installed
|
---|
247 | you need to provide the $header argument to the C<HTML::HeadParser>
|
---|
248 | constructor with your own object that implements the documented
|
---|
249 | protocol.
|
---|
250 |
|
---|
251 | =head1 COPYRIGHT
|
---|
252 |
|
---|
253 | Copyright 1996-2001 Gisle Aas. All rights reserved.
|
---|
254 |
|
---|
255 | This library is free software; you can redistribute it and/or
|
---|
256 | modify it under the same terms as Perl itself.
|
---|
257 |
|
---|
258 | =cut
|
---|
259 |
|
---|