source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/HTML/HeadParser.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 6.6 KB
Line 
1package HTML::HeadParser;
2
3=head1 NAME
4
5HTML::HeadParser - Parse <HEAD> section of a HTML document
6
7=head1 SYNOPSIS
8
9 require HTML::HeadParser;
10 $p = HTML::HeadParser->new;
11 $p->parse($text) and print "not finished";
12
13 $p->header('Title') # to access <title>....</title>
14 $p->header('Content-Base') # to access <base href="http://...">
15 $p->header('Foo') # to access <meta http-equiv="Foo" content="...">
16
17=head1 DESCRIPTION
18
19The C<HTML::HeadParser> is a specialized (and lightweight)
20C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
21section of an HTML document. The parse() method
22will return a FALSE value as soon as some E<lt>BODY> element or body
23text are found, and should not be called again after this.
24
25Note that the C<HTML::HeadParser> might get confused if raw undecoded
26UTF-8 is passed to the parse() method. Make sure the strings are
27properly decoded before passing them on.
28
29The C<HTML::HeadParser> keeps a reference to a header object, and the
30parser will update this header object as the various elements of the
31E<lt>HEAD> section of the HTML document are recognized. The following
32header fields are affected:
33
34=over 4
35
36=item Content-Base:
37
38The I<Content-Base> header is initialized from the E<lt>base
39href="..."> element.
40
41=item Title:
42
43The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
44element.
45
46=item Isindex:
47
48The I<Isindex> header will be added if there is a E<lt>isindex>
49element in the E<lt>head>. The header value is initialized from the
50I<prompt> attribute if it is present. If no I<prompt> attribute is
51given it will have '?' as the value.
52
53=item X-Meta-Foo:
54
55All E<lt>meta> elements will initialize headers with the prefix
56"C<X-Meta->" on the name. If the E<lt>meta> element contains a
57C<http-equiv> attribute, then it will be honored as the header name.
58
59=back
60
61=head1 METHODS
62
63The following methods (in addition to those provided by the
64superclass) are available:
65
66=over 4
67
68=cut
69
70
71require HTML::Parser;
72@ISA = qw(HTML::Parser);
73
74use HTML::Entities ();
75
use strict;
use vars qw($VERSION $DEBUG);
#$DEBUG = 1;    # uncomment to enable the debugging print statements

# Derive the module version from the revision keyword.  The keyword may
# expand either to a dotted revision ("2.22") or to a plain integer
# ("14078"); the pattern accepts both.  A pattern that insists on the
# dotted form fails to match a plain integer, which leaves sprintf without
# arguments and silently yields "0.00".
$VERSION = do {
    my($major, $minor) = q$Revision: 14078 $ =~ /(\d+)(?:\.(\d+))?/;
    sprintf("%d.%02d", $major || 0, $minor || 0);
};
80
81=item $hp = HTML::HeadParser->new
82
83=item $hp = HTML::HeadParser->new( $header )
84
85The object constructor. The optional $header argument should be a
86reference to an object that implements the header() and push_header()
87methods as defined by the C<HTTP::Headers> class. Normally it will be
88of some class that isa or delegates to the C<HTTP::Headers> class.
89
90If no $header is given C<HTML::HeadParser> will create an
91C<HTTP::Headers> object by itself (initially empty).
92
93=cut
94
# Constructor.  Accepts an optional header object; when none (or a false
# value) is supplied, HTTP::Headers is loaded on demand and a fresh
# instance is created.  The parser itself is built by the superclass
# constructor with <script> and <style> elements ignored.
sub new
{
    my $class  = shift;
    my $header = shift;

    if (!$header) {
        require HTTP::Headers;
        $header = HTTP::Headers->new;
    }

    my %parser_opts = (
        api_version     => 2,
        ignore_elements => [qw(script style)],
    );
    my $self = $class->SUPER::new(%parser_opts);

    # Per-object state used by the start/end/text handlers.
    $self->{'header'} = $header;    # where recognised fields are stored
    $self->{'tag'}    = '';         # name of active element that takes textual content
    $self->{'text'}   = '';         # the accumulated text associated with the element
    return $self;
}
111
112=item $hp->header;
113
114Returns a reference to the header object.
115
116=item $hp->header( $key )
117
118Returns a header value. It is just a shorter way to write
119C<$hp-E<gt>header-E<gt>header($key)>.
120
121=cut
122
# Accessor for the header object.
#   $hp->header          returns the header object itself
#   $hp->header(@args)   forwards @args to the header object's own
#                        header() method and returns its result
sub header
{
    my($self, @args) = @_;
    my $headers = $self->{'header'};
    return $headers unless @args;
    return $headers->header(@args);
}
129
# Legacy convenience method: returns whatever the header object's
# as_string() method returns.
sub as_string    # legacy
{
    my($self) = @_;
    my $headers = $self->{'header'};
    return $headers->as_string;
}
135
# Internal: finish the element whose text has been accumulating.
# The collected text is trimmed and runs of whitespace are collapsed to a
# single space.  For a <title> element the entities are then decoded and
# the result is pushed onto the header object as "Title".  In every case
# the active tag and the text buffer are reset to the empty string.
sub flush_text    # internal
{
    my($self) = @_;
    my($tag, $text) = @{$self}{qw(tag text)};

    # Normalise whitespace on our private copy of the buffer.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
        s/\s+/ /g;
    }
    print "FLUSH $tag => '$text'\n" if $DEBUG;

    if ($tag eq 'title') {
        HTML::Entities::decode($text);    # called in void context, edits $text
        $self->{'header'}->push_header(Title => $text);
    }

    $self->{'text'} = '';
    $self->{'tag'}  = '';
}
151
152# This is a quote from the HTML3.2 DTD which shows which elements
153# might be present in a <HEAD>...</HEAD>. Also note that the
154# <HEAD> tags themselves might be missing:
155#
156# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
157# SCRIPT* & META* & LINK*">
158#
159# <!ELEMENT HEAD O O (%head.content)>
160
161
# Start-tag handler.
#
#   $tag   element name; compared against lower-case names below
#   $attr  reference to a hash of the element's attributes (not modified)
#
# Any pending <title> text is flushed first.  Recognised head elements
# are then recorded on the header object:
#
#   meta     pushed under its http-equiv name, or under "X-Meta-<Name>"
#            when only a name attribute is present; skipped when neither
#            is usable.  Any ':' in the resulting field name is replaced
#            by '-', since a colon cannot be part of a header field name
#            (HTTP::Headers rejects such names).
#   base     href pushed as "Content-Base"
#   isindex  prompt attribute (or '?') pushed as "Isindex"
#   title    remembered as the active element so that text() collects it
#   link     pushed as "Link: <href>; attr="value"; ..." with the other
#            attributes in sorted order
#   head/html are ignored; any other element ends parsing via eof().
sub start
{
    my($self, $tag, $attr) = @_;    # $attr is reference to a HASH
    print "START[$tag]\n" if $DEBUG;
    $self->flush_text if $self->{'tag'};

    if ($tag eq 'meta') {
        my $key = $attr->{'http-equiv'};
        if (!defined($key) || !length($key)) {
            return unless $attr->{'name'};
            $key = "X-Meta-\u$attr->{'name'}";
        }
        # e.g. <meta name="DC:title"> must not yield an illegal field name
        $key =~ s/:/-/g;
        $self->{'header'}->push_header($key => $attr->{content});
    }
    elsif ($tag eq 'base') {
        return unless exists $attr->{href};
        $self->{'header'}->push_header('Content-Base' => $attr->{href});
    }
    elsif ($tag eq 'isindex') {
        # This is a non-standard header.  Perhaps we should just ignore
        # this element
        $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
    }
    elsif ($tag =~ /^(?:title|script|style)$/) {
        # Just remember tag.  Initialize header when we see the end tag.
        $self->{'tag'} = $tag;
    }
    elsif ($tag eq 'link') {
        return unless exists $attr->{href};
        # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
        my $h_val = "<$attr->{href}>";
        for my $name (sort keys %{$attr}) {
            next if $name eq 'href';    # already emitted as <...>
            next if $name eq '/';       # stray "/" from XHTML-style <link ... />
            $h_val .= qq(; $name="$attr->{$name}");
        }
        $self->{'header'}->push_header(Link => $h_val);
    }
    elsif ($tag eq 'head' || $tag eq 'html') {
        # ignore
    }
    else {
        # Anything else belongs to the body: stop parsing
        $self->eof;
    }
}
199
# End-tag handler.  Flushes the text of the active element, if there is
# one, and stops parsing (via eof) once </head> has been seen.
sub end
{
    my $self = shift;
    my $tag  = shift;
    print "END[$tag]\n" if $DEBUG;
    $self->{'tag'} and $self->flush_text;
    $tag eq 'head' and $self->eof;
}
207
# Text handler.  The first U+FEFF (byte order mark) in the chunk is
# removed.  While no element is active, whitespace-only text is skipped
# and any other text is taken as the start of the body, which ends
# parsing via eof().  While an element is active, the text is appended to
# the buffer only when that element is <title>.
sub text
{
    my $self  = shift;
    my $chunk = shift;
    $chunk =~ s/\x{FEFF}//;    # drop Unicode BOM if found
    print "TEXT[$chunk]\n" if $DEBUG;

    my $active = $self->{tag};
    unless ($active) {
        # Normal text means start of body
        $self->eof if $chunk =~ /\S/;
        return;
    }
    return unless $active eq 'title';
    $self->{'text'} .= $chunk;
}
222
2231;
224
225__END__
226
227=back
228
229=head1 EXAMPLE
230
231 $h = HTTP::Headers->new;
232 $p = HTML::HeadParser->new($h);
233 $p->parse(<<EOT);
234 <title>Stupid example</title>
235 <base href="http://www.linpro.no/lwp/">
236 Normal text starts here.
237 EOT
238 undef $p;
239 print $h->title; # should print "Stupid example"
240
241=head1 SEE ALSO
242
243L<HTML::Parser>, L<HTTP::Headers>
244
245The C<HTTP::Headers> class is distributed as part of the
246I<libwww-perl> package. If you don't have that distribution installed
247you need to provide the $header argument to the C<HTML::HeadParser>
248constructor with your own object that implements the documented
249protocol.
250
251=head1 COPYRIGHT
252
253Copyright 1996-2001 Gisle Aas. All rights reserved.
254
255This library is free software; you can redistribute it and/or
256modify it under the same terms as Perl itself.
257
258=cut
259
Note: See TracBrowser for help on using the repository browser.