Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/perllib/cpan/HTML/TokeParser.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago
Perl modules required for HTMLTidy
Property svn:keywords set to `Author Date Id Revision`
File size: 9.8 KB

Rev	Line
[14078]	1	package HTML::TokeParser;
	2
	3	# $Id: TokeParser.pm 14078 2007-05-17 03:15:41Z lh92 $
	4
	5	require HTML::PullParser;
	6	@ISA=qw(HTML::PullParser);
	7	$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
	8
	9	use strict;
	10	use Carp ();
	11	use HTML::Entities qw(decode_entities);
	12	use HTML::Tagset ();
	13
	14	my %ARGS =
	15	(
	16	start => "'S',tagname,attr,attrseq,text",
	17	end => "'E',tagname,text",
	18	text => "'T',text,is_cdata",
	19	process => "'PI',token0,text",
	20	comment => "'C',text",
	21	declaration => "'D',text",
	22
	23	# options that default on
	24	unbroken_text => 1,
	25	);
	26
	27
	28	sub new
	29	{
	30	my $class = shift;
	31	my %cnf;
	32	if (@_ == 1) {
	33	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
	34	%cnf = ($type => $_[0]);
	35	}
	36	else {
	37	%cnf = @_;
	38	}
	39
	40	my $textify = delete $cnf{textify} \|\| {img => "alt", applet => "alt"};
	41
	42	my $self = $class->SUPER::new(%cnf, %ARGS) \|\| return undef;
	43
	44	$self->{textify} = $textify;
	45	$self;
	46	}
	47
	48
	49	sub get_tag
	50	{
	51	my $self = shift;
	52	my $token;
	53	while (1) {
	54	$token = $self->get_token \|\| return undef;
	55	my $type = shift @$token;
	56	next unless $type eq "S" \|\| $type eq "E";
	57	substr($token->[0], 0, 0) = "/" if $type eq "E";
	58	return $token unless @_;
	59	for (@_) {
	60	return $token if $token->[0] eq $_;
	61	}
	62	}
	63	}
	64
	65
	66	sub _textify {
	67	my($self, $token) = @_;
	68	my $tag = $token->[1];
	69	return undef unless exists $self->{textify}{$tag};
	70
	71	my $alt = $self->{textify}{$tag};
	72	my $text;
	73	if (ref($alt)) {
	74	$text = &$alt(@$token);
	75	} else {
	76	$text = $token->[2]{$alt \|\| "alt"};
	77	$text = "[\U$tag]" unless defined $text;
	78	}
	79	return $text;
	80	}
	81
	82
	83	sub get_text
	84	{
	85	my $self = shift;
	86	my @text;
	87	while (my $token = $self->get_token) {
	88	my $type = $token->[0];
	89	if ($type eq "T") {
	90	my $text = $token->[1];
	91	decode_entities($text) unless $token->[2];
	92	push(@text, $text);
	93	} elsif ($type =~ /^[SE]$/) {
	94	my $tag = $token->[1];
	95	if ($type eq "S") {
	96	if (defined(my $text = _textify($self, $token))) {
	97	push(@text, $text);
	98	next;
	99	}
	100	} else {
	101	$tag = "/$tag";
	102	}
	103	if (!@_ \|\| grep $_ eq $tag, @_) {
	104	$self->unget_token($token);
	105	last;
	106	}
	107	push(@text, " ")
	108	if $tag eq "br" \|\| !$HTML::Tagset::isPhraseMarkup{$token->[1]};
	109	}
	110	}
	111	join("", @text);
	112	}
	113
	114
	115	sub get_trimmed_text
	116	{
	117	my $self = shift;
	118	my $text = $self->get_text(@_);
	119	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	120	$text;
	121	}
	122
	123	sub get_phrase {
	124	my $self = shift;
	125	my @text;
	126	while (my $token = $self->get_token) {
	127	my $type = $token->[0];
	128	if ($type eq "T") {
	129	my $text = $token->[1];
	130	decode_entities($text) unless $token->[2];
	131	push(@text, $text);
	132	} elsif ($type =~ /^[SE]$/) {
	133	my $tag = $token->[1];
	134	if ($type eq "S") {
	135	if (defined(my $text = _textify($self, $token))) {
	136	push(@text, $text);
	137	next;
	138	}
	139	}
	140	if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
	141	$self->unget_token($token);
	142	last;
	143	}
	144	push(@text, " ") if $tag eq "br";
	145	}
	146	}
	147	my $text = join("", @text);
	148	$text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
	149	$text;
	150	}
	151
	152	1;
	153
	154
	155	__END__
	156
	157	=head1 NAME
	158
	159	HTML::TokeParser - Alternative HTML::Parser interface
	160
	161	=head1 SYNOPSIS
	162
	163	require HTML::TokeParser;
	164	$p = HTML::TokeParser->new("index.html") \|\|
	165	die "Can't open: $!";
	166	$p->empty_element_tags(1); # configure its behaviour
	167
	168	while (my $token = $p->get_token) {
	169	#...
	170	}
	171
	172	=head1 DESCRIPTION
	173
	174	The C<HTML::TokeParser> is an alternative interface to the
	175	C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
	176	predeclared set of token types. If you wish the tokens to be reported
	177	differently you probably want to use the C<HTML::PullParser> directly.
	178
	179	The following methods are available:
	180
	181	=over 4
	182
	183	=item $p = HTML::TokeParser->new( $filename, %opt );
	184
	185	=item $p = HTML::TokeParser->new( $filehandle, %opt );
	186
	187	=item $p = HTML::TokeParser->new( \$document, %opt );
	188
	189	The object constructor argument is either a file name, a file handle
	190	object, or the complete document to be parsed. Extra options can be
	191	provided as key/value pairs and are processed as documented by the base
	192	classes.
	193
	194	If the argument is a plain scalar, then it is taken as the name of a
	195	file to be opened and parsed. If the file can't be opened for
	196	reading, then the constructor will return C<undef> and $! will tell
	197	you why it failed.
	198
	199	If the argument is a reference to a plain scalar, then this scalar is
	200	taken to be the literal document to parse. The value of this
	201	scalar should not be changed before all tokens have been extracted.
	202
	203	Otherwise the argument is taken to be some object that the
	204	C<HTML::TokeParser> can read() from when it needs more data. Typically
	205	it will be a filehandle of some kind. The stream will be read() until
	206	EOF, but not closed.
	207
	208	A newly constructed C<HTML::TokeParser> differ from its base classes
	209	by having the C<unbroken_text> attribute enabled by default. See
	210	L<HTML::Parser> for a description of this and other attributes that
	211	influence how the document is parsed. It is often a good idea to enable
	212	C<empty_element_tags> behaviour.
	213
	214	Note that the parsing result will likely not be valid if raw undecoded
	215	UTF-8 is used as a source. When parsing UTF-8 encoded files turn
	216	on UTF-8 decoding:
	217
	218	open(my $fh, "<:utf8", "index.html") \|\| die "Can't open 'index.html': $!";
	219	my $p = HTML::TokeParser->new( $fh );
	220	# ...
	221
	222	If a $filename is passed to the constructor the file will be opened in
	223	raw mode and the parsing result will only be valid if its content is
	224	Latin-1 or pure ASCII.
	225
	226	If parsing from an UTF-8 encoded string buffer decode it first:
	227
	228	utf8::decode($document);
	229	my $p = HTML::TokeParser->new( \$document );
	230	# ...
	231
	232	=item $p->get_token
	233
	234	This method will return the next I<token> found in the HTML document,
	235	or C<undef> at the end of the document. The token is returned as an
	236	array reference. The first element of the array will be a string
	237	denoting the type of this token: "S" for start tag, "E" for end tag,
	238	"T" for text, "C" for comment, "D" for declaration, and "PI" for
	239	process instructions. The rest of the token array depend on the type
	240	like this:
	241
	242	["S", $tag, $attr, $attrseq, $text]
	243	["E", $tag, $text]
	244	["T", $text, $is_data]
	245	["C", $text]
	246	["D", $text]
	247	["PI", $token0, $text]
	248
	249	where $attr is a hash reference, $attrseq is an array reference and
	250	the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
	251	details.
	252
	253	=item $p->unget_token( @tokens )
	254
	255	If you find you have read too many tokens you can push them back,
	256	so that they are returned the next time $p->get_token is called.
	257
	258	=item $p->get_tag
	259
	260	=item $p->get_tag( @tags )
	261
	262	This method returns the next start or end tag (skipping any other
	263	tokens), or C<undef> if there are no more tags in the document. If
	264	one or more arguments are given, then we skip tokens until one of the
	265	specified tag types is found. For example:
	266
	267	$p->get_tag("font", "/font");
	268
	269	will find the next start or end tag for a font-element.
	270
	271	The tag information is returned as an array reference in the same form
	272	as for $p->get_token above, but the type code (first element) is
	273	missing. A start tag will be returned like this:
	274
	275	[$tag, $attr, $attrseq, $text]
	276
	277	The tagname of end tags are prefixed with "/", i.e. end tag is
	278	returned like this:
	279
	280	["/$tag", $text]
	281
	282	=item $p->get_text
	283
	284	=item $p->get_text( @endtags )
	285
	286	This method returns all text found at the current position. It will
	287	return a zero length string if the next token is not text. Any
	288	entities will be converted to their corresponding character.
	289
	290	If one or more arguments are given, then we return all text occurring
	291	before the first of the specified tags found. For example:
	292
	293	$p->get_text("p", "br");
	294
	295	will return the text up to either a paragraph of linebreak element.
	296
	297	The text might span tags that should be I<textified>. This is
	298	controlled by the $p->{textify} attribute, which is a hash that
	299	defines how certain tags can be treated as text. If the name of a
	300	start tag matches a key in this hash then this tag is converted to
	301	text. The hash value is used to specify which tag attribute to obtain
	302	the text from. If this tag attribute is missing, then the upper case
	303	name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
	304	hash value can also be a subroutine reference. In this case the
	305	routine is called with the start tag token content as its argument and
	306	the return value is treated as the text.
	307
	308	The default $p->{textify} value is:
	309
	310	{img => "alt", applet => "alt"}
	311
	312	This means that <IMG> and <APPLET> tags are treated as text, and that
	313	the text to substitute can be found in the ALT attribute.
	314
	315	=item $p->get_trimmed_text
	316
	317	=item $p->get_trimmed_text( @endtags )
	318
	319	Same as $p->get_text above, but will collapse any sequences of white
	320	space to a single space character. Leading and trailing white space is
	321	removed.
	322
	323	=item $p->get_phrase
	324
	325	This will return all text found at the current position ignoring any
	326	phrasal-level tags. Text is extracted until the first non
	327	phrasal-level tag. Textification of tags is the same as for
	328	get_text(). This method will collapse white space in the same way as
	329	get_trimmed_text() does.
	330
	331	The definition of <i>phrasal-level tags</i> is obtained from the
	332	HTML::Tagset module.
	333
	334	=back
	335
	336	=head1 EXAMPLES
	337
	338	This example extracts all links from a document. It will print one
	339	line for each link, containing the URL and the textual description
	340	between the <A>...</A> tags:
	341
	342	use HTML::TokeParser;
	343	$p = HTML::TokeParser->new(shift\|\|"index.html");
	344
	345	while (my $token = $p->get_tag("a")) {
	346	my $url = $token->[1]{href} \|\| "-";
	347	my $text = $p->get_trimmed_text("/a");
	348	print "$url\t$text\n";
	349	}
	350
	351	This example extract the <TITLE> from the document:
	352
	353	use HTML::TokeParser;
	354	$p = HTML::TokeParser->new(shift\|\|"index.html");
	355	if ($p->get_tag("title")) {
	356	my $title = $p->get_trimmed_text;
	357	print "Title: $title\n";
	358	}
	359
	360	=head1 SEE ALSO
	361
	362	L<HTML::PullParser>, L<HTML::Parser>
	363
	364	=head1 COPYRIGHT
	365
	366	Copyright 1998-2005 Gisle Aas.
	367
	368	This library is free software; you can redistribute it and/or
	369	modify it under the same terms as Perl itself.
	370
	371	=cut

Note: See TracBrowser for help on using the repository browser.

Download in other formats: