Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/HTML/HeadParser.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago
An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)
File size: 6.6 KB

Rev	Line
[24626]	1	package HTML::HeadParser;
	2
	3	=head1 NAME
	4
	5	HTML::HeadParser - Parse <HEAD> section of a HTML document
	6
	7	=head1 SYNOPSIS
	8
	9	require HTML::HeadParser;
	10	$p = HTML::HeadParser->new;
	11	$p->parse($text) and print "not finished";
	12
	13	$p->header('Title') # to access <title>....</title>
	14	$p->header('Content-Base') # to access <base href="http://...">
	15	$p->header('Foo') # to access <meta http-equiv="Foo" content="...">
	16
	17	=head1 DESCRIPTION
	18
	19	The C<HTML::HeadParser> is a specialized (and lightweight)
	20	C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
	21	section of an HTML document. The parse() method
	22	will return a FALSE value as soon as some E<lt>BODY> element or body
	23	text are found, and should not be called again after this.
	24
	25	Note that the C<HTML::HeadParser> might get confused if raw undecoded
	26	UTF-8 is passed to the parse() method. Make sure the strings are
	27	properly decoded before passing them on.
	28
	29	The C<HTML::HeadParser> keeps a reference to a header object, and the
	30	parser will update this header object as the various elements of the
	31	E<lt>HEAD> section of the HTML document are recognized. The following
	32	header fields are affected:
	33
	34	=over 4
	35
	36	=item Content-Base:
	37
	38	The I<Content-Base> header is initialized from the E<lt>base
	39	href="..."> element.
	40
	41	=item Title:
	42
	43	The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
	44	element.
	45
	46	=item Isindex:
	47
	48	The I<Isindex> header will be added if there is a E<lt>isindex>
	49	element in the E<lt>head>. The header value is initialized from the
	50	I<prompt> attribute if it is present. If no I<prompt> attribute is
	51	given it will have '?' as the value.
	52
	53	=item X-Meta-Foo:
	54
	55	All E<lt>meta> elements will initialize headers with the prefix
	56	"C<X-Meta->" on the name. If the E<lt>meta> element contains a
	57	C<http-equiv> attribute, then it will be honored as the header name.
	58
	59	=back
	60
	61	=head1 METHODS
	62
	63	The following methods (in addition to those provided by the
	64	superclass) are available:
	65
	66	=over 4
	67
	68	=cut
	69
	70
	71	require HTML::Parser;
	72	@ISA = qw(HTML::Parser);
	73
	74	use HTML::Entities ();
	75
	76	use strict;
	77	use vars qw($VERSION $DEBUG);
	78	#$DEBUG = 1;
	79	$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
	80
	81	=item $hp = HTML::HeadParser->new
	82
	83	=item $hp = HTML::HeadParser->new( $header )
	84
	85	The object constructor. The optional $header argument should be a
	86	reference to an object that implement the header() and push_header()
	87	methods as defined by the C<HTTP::Headers> class. Normally it will be
	88	of some class that isa or delegates to the C<HTTP::Headers> class.
	89
	90	If no $header is given C<HTML::HeadParser> will create an
	91	C<HTTP::Header> object by itself (initially empty).
	92
	93	=cut
	94
	95	sub new
	96	{
	97	my($class, $header) = @_;
	98	unless ($header) {
	99	require HTTP::Headers;
	100	$header = HTTP::Headers->new;
	101	}
	102
	103	my $self = $class->SUPER::new(api_version => 2,
	104	ignore_elements => [qw(script style)],
	105	);
	106	$self->{'header'} = $header;
	107	$self->{'tag'} = ''; # name of active element that takes textual content
	108	$self->{'text'} = ''; # the accumulated text associated with the element
	109	$self;
	110	}
	111
	112	=item $hp->header;
	113
	114	Returns a reference to the header object.
	115
	116	=item $hp->header( $key )
	117
	118	Returns a header value. It is just a shorter way to write
	119	C<$hp-E<gt>header-E<gt>header($key)>.
	120
	121	=cut
	122
	123	sub header
	124	{
	125	my $self = shift;
	126	return $self->{'header'} unless @_;
	127	$self->{'header'}->header(@_);
	128	}
	129
	130	sub as_string # legacy
	131	{
	132	my $self = shift;
	133	$self->{'header'}->as_string;
	134	}
	135
	136	sub flush_text # internal
	137	{
	138	my $self = shift;
	139	my $tag = $self->{'tag'};
	140	my $text = $self->{'text'};
	141	$text =~ s/^\s+//;
	142	$text =~ s/\s+$//;
	143	$text =~ s/\s+/ /g;
	144	print "FLUSH $tag => '$text'\n" if $DEBUG;
	145	if ($tag eq 'title') {
	146	HTML::Entities::decode($text);
	147	$self->{'header'}->push_header(Title => $text);
	148	}
	149	$self->{'tag'} = $self->{'text'} = '';
	150	}
	151
	152	# This is an quote from the HTML3.2 DTD which shows which elements
	153	# that might be present in a <HEAD>...</HEAD>. Also note that the
	154	# <HEAD> tags themselves might be missing:
	155	#
	156	# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
	157	# SCRIPT* & META* & LINK*">
	158	#
	159	# <!ELEMENT HEAD O O (%head.content)>
	160
	161
	162	sub start
	163	{
	164	my($self, $tag, $attr) = @_; # $attr is reference to a HASH
	165	print "START[$tag]\n" if $DEBUG;
	166	$self->flush_text if $self->{'tag'};
	167	if ($tag eq 'meta') {
	168	my $key = $attr->{'http-equiv'};
	169	if (!defined($key) \|\| !length($key)) {
	170	return unless $attr->{'name'};
	171	$key = "X-Meta-\u$attr->{'name'}";
	172	}
	173	$self->{'header'}->push_header($key => $attr->{content});
	174	} elsif ($tag eq 'base') {
	175	return unless exists $attr->{href};
	176	$self->{'header'}->push_header('Content-Base' => $attr->{href});
	177	} elsif ($tag eq 'isindex') {
	178	# This is a non-standard header. Perhaps we should just ignore
	179	# this element
	180	$self->{'header'}->push_header(Isindex => $attr->{prompt} \|\| '?');
	181	} elsif ($tag =~ /^(?:title\|script\|style)$/) {
	182	# Just remember tag. Initialize header when we see the end tag.
	183	$self->{'tag'} = $tag;
	184	} elsif ($tag eq 'link') {
	185	return unless exists $attr->{href};
	186	# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
	187	my $h_val = "<" . delete($attr->{href}) . ">";
	188	for (sort keys %{$attr}) {
	189	$h_val .= qq(; $_="$attr->{$_}");
	190	}
	191	$self->{'header'}->push_header(Link => $h_val);
	192	} elsif ($tag eq 'head' \|\| $tag eq 'html') {
	193	# ignore
	194	} else {
	195	# stop parsing
	196	$self->eof;
	197	}
	198	}
	199
	200	sub end
	201	{
	202	my($self, $tag) = @_;
	203	print "END[$tag]\n" if $DEBUG;
	204	$self->flush_text if $self->{'tag'};
	205	$self->eof if $tag eq 'head';
	206	}
	207
	208	sub text
	209	{
	210	my($self, $text) = @_;
	211	$text =~ s/\x{FEFF}//; # drop Unicode BOM if found
	212	print "TEXT[$text]\n" if $DEBUG;
	213	my $tag = $self->{tag};
	214	if (!$tag && $text =~ /\S/) {
	215	# Normal text means start of body
	216	$self->eof;
	217	return;
	218	}
	219	return if $tag ne 'title';
	220	$self->{'text'} .= $text;
	221	}
	222
	223	1;
	224
	225	__END__
	226
	227	=back
	228
	229	=head1 EXAMPLE
	230
	231	$h = HTTP::Headers->new;
	232	$p = HTML::HeadParser->new($h);
	233	$p->parse(<<EOT);
	234	<title>Stupid example</title>
	235	<base href="http://www.linpro.no/lwp/">
	236	Normal text starts here.
	237	EOT
	238	undef $p;
	239	print $h->title; # should print "Stupid example"
	240
	241	=head1 SEE ALSO
	242
	243	L<HTML::Parser>, L<HTTP::Headers>
	244
	245	The C<HTTP::Headers> class is distributed as part of the
	246	I<libwww-perl> package. If you don't have that distribution installed
	247	you need to provide the $header argument to the C<HTML::HeadParser>
	248	constructor with your own object that implements the documented
	249	protocol.
	250
	251	=head1 COPYRIGHT
	252
	253	Copyright 1996-2001 Gisle Aas. All rights reserved.
	254
	255	This library is free software; you can redistribute it and/or
	256	modify it under the same terms as Perl itself.
	257
	258	=cut
	259

Note: See TracBrowser for help on using the repository browser.

Download in other formats: