[24626] | 1 | package HTML::HeadParser;
|
---|
| 2 |
|
---|
| 3 | =head1 NAME
|
---|
| 4 |
|
---|
| 5 | HTML::HeadParser - Parse <HEAD> section of a HTML document
|
---|
| 6 |
|
---|
| 7 | =head1 SYNOPSIS
|
---|
| 8 |
|
---|
| 9 | require HTML::HeadParser;
|
---|
| 10 | $p = HTML::HeadParser->new;
|
---|
| 11 | $p->parse($text) and print "not finished";
|
---|
| 12 |
|
---|
| 13 | $p->header('Title') # to access <title>....</title>
|
---|
| 14 | $p->header('Content-Base') # to access <base href="http://...">
|
---|
| 15 | $p->header('Foo') # to access <meta http-equiv="Foo" content="...">
|
---|
| 16 |
|
---|
| 17 | =head1 DESCRIPTION
|
---|
| 18 |
|
---|
| 19 | The C<HTML::HeadParser> is a specialized (and lightweight)
|
---|
| 20 | C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
|
---|
| 21 | section of an HTML document. The parse() method
|
---|
| 22 | will return a FALSE value as soon as some E<lt>BODY> element or body
|
---|
| 23 | text are found, and should not be called again after this.
|
---|
| 24 |
|
---|
| 25 | Note that the C<HTML::HeadParser> might get confused if raw undecoded
|
---|
| 26 | UTF-8 is passed to the parse() method. Make sure the strings are
|
---|
| 27 | properly decoded before passing them on.
|
---|
| 28 |
|
---|
| 29 | The C<HTML::HeadParser> keeps a reference to a header object, and the
|
---|
| 30 | parser will update this header object as the various elements of the
|
---|
| 31 | E<lt>HEAD> section of the HTML document are recognized. The following
|
---|
| 32 | header fields are affected:
|
---|
| 33 |
|
---|
| 34 | =over 4
|
---|
| 35 |
|
---|
| 36 | =item Content-Base:
|
---|
| 37 |
|
---|
| 38 | The I<Content-Base> header is initialized from the E<lt>base
|
---|
| 39 | href="..."> element.
|
---|
| 40 |
|
---|
| 41 | =item Title:
|
---|
| 42 |
|
---|
| 43 | The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
|
---|
| 44 | element.
|
---|
| 45 |
|
---|
| 46 | =item Isindex:
|
---|
| 47 |
|
---|
| 48 | The I<Isindex> header will be added if there is a E<lt>isindex>
|
---|
| 49 | element in the E<lt>head>. The header value is initialized from the
|
---|
| 50 | I<prompt> attribute if it is present. If no I<prompt> attribute is
|
---|
| 51 | given it will have '?' as the value.
|
---|
| 52 |
|
---|
| 53 | =item X-Meta-Foo:
|
---|
| 54 |
|
---|
| 55 | All E<lt>meta> elements will initialize headers with the prefix
|
---|
| 56 | "C<X-Meta->" on the name. If the E<lt>meta> element contains a
|
---|
| 57 | C<http-equiv> attribute, then it will be honored as the header name.
|
---|
| 58 |
|
---|
| 59 | =back
|
---|
| 60 |
|
---|
| 61 | =head1 METHODS
|
---|
| 62 |
|
---|
| 63 | The following methods (in addition to those provided by the
|
---|
| 64 | superclass) are available:
|
---|
| 65 |
|
---|
| 66 | =over 4
|
---|
| 67 |
|
---|
| 68 | =cut
|
---|
| 69 |
|
---|
| 70 |
|
---|
| 71 | require HTML::Parser;
|
---|
| 72 | @ISA = qw(HTML::Parser);
|
---|
| 73 |
|
---|
| 74 | use HTML::Entities ();
|
---|
| 75 |
|
---|
| 76 | use strict;
|
---|
| 77 | use vars qw($VERSION $DEBUG);
|
---|
| 78 | #$DEBUG = 1;
|
---|
| 79 | $VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
|
---|
| 80 |
|
---|
| 81 | =item $hp = HTML::HeadParser->new
|
---|
| 82 |
|
---|
| 83 | =item $hp = HTML::HeadParser->new( $header )
|
---|
| 84 |
|
---|
| 85 | The object constructor. The optional $header argument should be a
|
---|
| 86 | reference to an object that implement the header() and push_header()
|
---|
| 87 | methods as defined by the C<HTTP::Headers> class. Normally it will be
|
---|
| 88 | of some class that isa or delegates to the C<HTTP::Headers> class.
|
---|
| 89 |
|
---|
| 90 | If no $header is given C<HTML::HeadParser> will create an
|
---|
| 91 | C<HTTP::Header> object by itself (initially empty).
|
---|
| 92 |
|
---|
| 93 | =cut
|
---|
| 94 |
|
---|
| 95 | sub new
|
---|
| 96 | {
|
---|
| 97 | my($class, $header) = @_;
|
---|
| 98 | unless ($header) {
|
---|
| 99 | require HTTP::Headers;
|
---|
| 100 | $header = HTTP::Headers->new;
|
---|
| 101 | }
|
---|
| 102 |
|
---|
| 103 | my $self = $class->SUPER::new(api_version => 2,
|
---|
| 104 | ignore_elements => [qw(script style)],
|
---|
| 105 | );
|
---|
| 106 | $self->{'header'} = $header;
|
---|
| 107 | $self->{'tag'} = ''; # name of active element that takes textual content
|
---|
| 108 | $self->{'text'} = ''; # the accumulated text associated with the element
|
---|
| 109 | $self;
|
---|
| 110 | }
|
---|
| 111 |
|
---|
| 112 | =item $hp->header;
|
---|
| 113 |
|
---|
| 114 | Returns a reference to the header object.
|
---|
| 115 |
|
---|
| 116 | =item $hp->header( $key )
|
---|
| 117 |
|
---|
| 118 | Returns a header value. It is just a shorter way to write
|
---|
| 119 | C<$hp-E<gt>header-E<gt>header($key)>.
|
---|
| 120 |
|
---|
| 121 | =cut
|
---|
| 122 |
|
---|
| 123 | sub header
|
---|
| 124 | {
|
---|
| 125 | my $self = shift;
|
---|
| 126 | return $self->{'header'} unless @_;
|
---|
| 127 | $self->{'header'}->header(@_);
|
---|
| 128 | }
|
---|
| 129 |
|
---|
| 130 | sub as_string # legacy
|
---|
| 131 | {
|
---|
| 132 | my $self = shift;
|
---|
| 133 | $self->{'header'}->as_string;
|
---|
| 134 | }
|
---|
| 135 |
|
---|
| 136 | sub flush_text # internal
|
---|
| 137 | {
|
---|
| 138 | my $self = shift;
|
---|
| 139 | my $tag = $self->{'tag'};
|
---|
| 140 | my $text = $self->{'text'};
|
---|
| 141 | $text =~ s/^\s+//;
|
---|
| 142 | $text =~ s/\s+$//;
|
---|
| 143 | $text =~ s/\s+/ /g;
|
---|
| 144 | print "FLUSH $tag => '$text'\n" if $DEBUG;
|
---|
| 145 | if ($tag eq 'title') {
|
---|
| 146 | HTML::Entities::decode($text);
|
---|
| 147 | $self->{'header'}->push_header(Title => $text);
|
---|
| 148 | }
|
---|
| 149 | $self->{'tag'} = $self->{'text'} = '';
|
---|
| 150 | }
|
---|
| 151 |
|
---|
| 152 | # This is an quote from the HTML3.2 DTD which shows which elements
|
---|
| 153 | # that might be present in a <HEAD>...</HEAD>. Also note that the
|
---|
| 154 | # <HEAD> tags themselves might be missing:
|
---|
| 155 | #
|
---|
| 156 | # <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
|
---|
| 157 | # SCRIPT* & META* & LINK*">
|
---|
| 158 | #
|
---|
| 159 | # <!ELEMENT HEAD O O (%head.content)>
|
---|
| 160 |
|
---|
| 161 |
|
---|
| 162 | sub start
|
---|
| 163 | {
|
---|
| 164 | my($self, $tag, $attr) = @_; # $attr is reference to a HASH
|
---|
| 165 | print "START[$tag]\n" if $DEBUG;
|
---|
| 166 | $self->flush_text if $self->{'tag'};
|
---|
| 167 | if ($tag eq 'meta') {
|
---|
| 168 | my $key = $attr->{'http-equiv'};
|
---|
| 169 | if (!defined($key) || !length($key)) {
|
---|
| 170 | return unless $attr->{'name'};
|
---|
| 171 | $key = "X-Meta-\u$attr->{'name'}";
|
---|
| 172 | }
|
---|
| 173 | $self->{'header'}->push_header($key => $attr->{content});
|
---|
| 174 | } elsif ($tag eq 'base') {
|
---|
| 175 | return unless exists $attr->{href};
|
---|
| 176 | $self->{'header'}->push_header('Content-Base' => $attr->{href});
|
---|
| 177 | } elsif ($tag eq 'isindex') {
|
---|
| 178 | # This is a non-standard header. Perhaps we should just ignore
|
---|
| 179 | # this element
|
---|
| 180 | $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
|
---|
| 181 | } elsif ($tag =~ /^(?:title|script|style)$/) {
|
---|
| 182 | # Just remember tag. Initialize header when we see the end tag.
|
---|
| 183 | $self->{'tag'} = $tag;
|
---|
| 184 | } elsif ($tag eq 'link') {
|
---|
| 185 | return unless exists $attr->{href};
|
---|
| 186 | # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
|
---|
| 187 | my $h_val = "<" . delete($attr->{href}) . ">";
|
---|
| 188 | for (sort keys %{$attr}) {
|
---|
| 189 | $h_val .= qq(; $_="$attr->{$_}");
|
---|
| 190 | }
|
---|
| 191 | $self->{'header'}->push_header(Link => $h_val);
|
---|
| 192 | } elsif ($tag eq 'head' || $tag eq 'html') {
|
---|
| 193 | # ignore
|
---|
| 194 | } else {
|
---|
| 195 | # stop parsing
|
---|
| 196 | $self->eof;
|
---|
| 197 | }
|
---|
| 198 | }
|
---|
| 199 |
|
---|
| 200 | sub end
|
---|
| 201 | {
|
---|
| 202 | my($self, $tag) = @_;
|
---|
| 203 | print "END[$tag]\n" if $DEBUG;
|
---|
| 204 | $self->flush_text if $self->{'tag'};
|
---|
| 205 | $self->eof if $tag eq 'head';
|
---|
| 206 | }
|
---|
| 207 |
|
---|
| 208 | sub text
|
---|
| 209 | {
|
---|
| 210 | my($self, $text) = @_;
|
---|
| 211 | $text =~ s/\x{FEFF}//; # drop Unicode BOM if found
|
---|
| 212 | print "TEXT[$text]\n" if $DEBUG;
|
---|
| 213 | my $tag = $self->{tag};
|
---|
| 214 | if (!$tag && $text =~ /\S/) {
|
---|
| 215 | # Normal text means start of body
|
---|
| 216 | $self->eof;
|
---|
| 217 | return;
|
---|
| 218 | }
|
---|
| 219 | return if $tag ne 'title';
|
---|
| 220 | $self->{'text'} .= $text;
|
---|
| 221 | }
|
---|
| 222 |
|
---|
| 223 | 1;
|
---|
| 224 |
|
---|
| 225 | __END__
|
---|
| 226 |
|
---|
| 227 | =back
|
---|
| 228 |
|
---|
| 229 | =head1 EXAMPLE
|
---|
| 230 |
|
---|
| 231 | $h = HTTP::Headers->new;
|
---|
| 232 | $p = HTML::HeadParser->new($h);
|
---|
| 233 | $p->parse(<<EOT);
|
---|
| 234 | <title>Stupid example</title>
|
---|
| 235 | <base href="http://www.linpro.no/lwp/">
|
---|
| 236 | Normal text starts here.
|
---|
| 237 | EOT
|
---|
| 238 | undef $p;
|
---|
| 239 | print $h->title; # should print "Stupid example"
|
---|
| 240 |
|
---|
| 241 | =head1 SEE ALSO
|
---|
| 242 |
|
---|
| 243 | L<HTML::Parser>, L<HTTP::Headers>
|
---|
| 244 |
|
---|
| 245 | The C<HTTP::Headers> class is distributed as part of the
|
---|
| 246 | I<libwww-perl> package. If you don't have that distribution installed
|
---|
| 247 | you need to provide the $header argument to the C<HTML::HeadParser>
|
---|
| 248 | constructor with your own object that implements the documented
|
---|
| 249 | protocol.
|
---|
| 250 |
|
---|
| 251 | =head1 COPYRIGHT
|
---|
| 252 |
|
---|
| 253 | Copyright 1996-2001 Gisle Aas. All rights reserved.
|
---|
| 254 |
|
---|
| 255 | This library is free software; you can redistribute it and/or
|
---|
| 256 | modify it under the same terms as Perl itself.
|
---|
| 257 |
|
---|
| 258 | =cut
|
---|
| 259 |
|
---|