[14078] | 1 | package HTML::TokeParser;
|
---|
| 2 |
|
---|
| 3 | # $Id: TokeParser.pm 14078 2007-05-17 03:15:41Z lh92 $
|
---|
| 4 |
|
---|
| 5 | require HTML::PullParser;
|
---|
| 6 | @ISA=qw(HTML::PullParser);
|
---|
| 7 | $VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
|
---|
| 8 |
|
---|
| 9 | use strict;
|
---|
| 10 | use Carp ();
|
---|
| 11 | use HTML::Entities qw(decode_entities);
|
---|
| 12 | use HTML::Tagset ();
|
---|
| 13 |
|
---|
| 14 | my %ARGS =
|
---|
| 15 | (
|
---|
| 16 | start => "'S',tagname,attr,attrseq,text",
|
---|
| 17 | end => "'E',tagname,text",
|
---|
| 18 | text => "'T',text,is_cdata",
|
---|
| 19 | process => "'PI',token0,text",
|
---|
| 20 | comment => "'C',text",
|
---|
| 21 | declaration => "'D',text",
|
---|
| 22 |
|
---|
| 23 | # options that default on
|
---|
| 24 | unbroken_text => 1,
|
---|
| 25 | );
|
---|
| 26 |
|
---|
| 27 |
|
---|
| 28 | sub new
|
---|
| 29 | {
|
---|
| 30 | my $class = shift;
|
---|
| 31 | my %cnf;
|
---|
| 32 | if (@_ == 1) {
|
---|
| 33 | my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
|
---|
| 34 | %cnf = ($type => $_[0]);
|
---|
| 35 | }
|
---|
| 36 | else {
|
---|
| 37 | %cnf = @_;
|
---|
| 38 | }
|
---|
| 39 |
|
---|
| 40 | my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
|
---|
| 41 |
|
---|
| 42 | my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
|
---|
| 43 |
|
---|
| 44 | $self->{textify} = $textify;
|
---|
| 45 | $self;
|
---|
| 46 | }
|
---|
| 47 |
|
---|
| 48 |
|
---|
| 49 | sub get_tag
|
---|
| 50 | {
|
---|
| 51 | my $self = shift;
|
---|
| 52 | my $token;
|
---|
| 53 | while (1) {
|
---|
| 54 | $token = $self->get_token || return undef;
|
---|
| 55 | my $type = shift @$token;
|
---|
| 56 | next unless $type eq "S" || $type eq "E";
|
---|
| 57 | substr($token->[0], 0, 0) = "/" if $type eq "E";
|
---|
| 58 | return $token unless @_;
|
---|
| 59 | for (@_) {
|
---|
| 60 | return $token if $token->[0] eq $_;
|
---|
| 61 | }
|
---|
| 62 | }
|
---|
| 63 | }
|
---|
| 64 |
|
---|
| 65 |
|
---|
| 66 | sub _textify {
|
---|
| 67 | my($self, $token) = @_;
|
---|
| 68 | my $tag = $token->[1];
|
---|
| 69 | return undef unless exists $self->{textify}{$tag};
|
---|
| 70 |
|
---|
| 71 | my $alt = $self->{textify}{$tag};
|
---|
| 72 | my $text;
|
---|
| 73 | if (ref($alt)) {
|
---|
| 74 | $text = &$alt(@$token);
|
---|
| 75 | } else {
|
---|
| 76 | $text = $token->[2]{$alt || "alt"};
|
---|
| 77 | $text = "[\U$tag]" unless defined $text;
|
---|
| 78 | }
|
---|
| 79 | return $text;
|
---|
| 80 | }
|
---|
| 81 |
|
---|
| 82 |
|
---|
| 83 | sub get_text
|
---|
| 84 | {
|
---|
| 85 | my $self = shift;
|
---|
| 86 | my @text;
|
---|
| 87 | while (my $token = $self->get_token) {
|
---|
| 88 | my $type = $token->[0];
|
---|
| 89 | if ($type eq "T") {
|
---|
| 90 | my $text = $token->[1];
|
---|
| 91 | decode_entities($text) unless $token->[2];
|
---|
| 92 | push(@text, $text);
|
---|
| 93 | } elsif ($type =~ /^[SE]$/) {
|
---|
| 94 | my $tag = $token->[1];
|
---|
| 95 | if ($type eq "S") {
|
---|
| 96 | if (defined(my $text = _textify($self, $token))) {
|
---|
| 97 | push(@text, $text);
|
---|
| 98 | next;
|
---|
| 99 | }
|
---|
| 100 | } else {
|
---|
| 101 | $tag = "/$tag";
|
---|
| 102 | }
|
---|
| 103 | if (!@_ || grep $_ eq $tag, @_) {
|
---|
| 104 | $self->unget_token($token);
|
---|
| 105 | last;
|
---|
| 106 | }
|
---|
| 107 | push(@text, " ")
|
---|
| 108 | if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
|
---|
| 109 | }
|
---|
| 110 | }
|
---|
| 111 | join("", @text);
|
---|
| 112 | }
|
---|
| 113 |
|
---|
| 114 |
|
---|
| 115 | sub get_trimmed_text
|
---|
| 116 | {
|
---|
| 117 | my $self = shift;
|
---|
| 118 | my $text = $self->get_text(@_);
|
---|
| 119 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
---|
| 120 | $text;
|
---|
| 121 | }
|
---|
| 122 |
|
---|
| 123 | sub get_phrase {
|
---|
| 124 | my $self = shift;
|
---|
| 125 | my @text;
|
---|
| 126 | while (my $token = $self->get_token) {
|
---|
| 127 | my $type = $token->[0];
|
---|
| 128 | if ($type eq "T") {
|
---|
| 129 | my $text = $token->[1];
|
---|
| 130 | decode_entities($text) unless $token->[2];
|
---|
| 131 | push(@text, $text);
|
---|
| 132 | } elsif ($type =~ /^[SE]$/) {
|
---|
| 133 | my $tag = $token->[1];
|
---|
| 134 | if ($type eq "S") {
|
---|
| 135 | if (defined(my $text = _textify($self, $token))) {
|
---|
| 136 | push(@text, $text);
|
---|
| 137 | next;
|
---|
| 138 | }
|
---|
| 139 | }
|
---|
| 140 | if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
|
---|
| 141 | $self->unget_token($token);
|
---|
| 142 | last;
|
---|
| 143 | }
|
---|
| 144 | push(@text, " ") if $tag eq "br";
|
---|
| 145 | }
|
---|
| 146 | }
|
---|
| 147 | my $text = join("", @text);
|
---|
| 148 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
---|
| 149 | $text;
|
---|
| 150 | }
|
---|
| 151 |
|
---|
| 152 | 1;
|
---|
| 153 |
|
---|
| 154 |
|
---|
| 155 | __END__
|
---|
| 156 |
|
---|
| 157 | =head1 NAME
|
---|
| 158 |
|
---|
| 159 | HTML::TokeParser - Alternative HTML::Parser interface
|
---|
| 160 |
|
---|
| 161 | =head1 SYNOPSIS
|
---|
| 162 |
|
---|
| 163 | require HTML::TokeParser;
|
---|
| 164 | $p = HTML::TokeParser->new("index.html") ||
|
---|
| 165 | die "Can't open: $!";
|
---|
| 166 | $p->empty_element_tags(1); # configure its behaviour
|
---|
| 167 |
|
---|
| 168 | while (my $token = $p->get_token) {
|
---|
| 169 | #...
|
---|
| 170 | }
|
---|
| 171 |
|
---|
| 172 | =head1 DESCRIPTION
|
---|
| 173 |
|
---|
| 174 | The C<HTML::TokeParser> is an alternative interface to the
|
---|
| 175 | C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
|
---|
| 176 | predeclared set of token types. If you wish the tokens to be reported
|
---|
| 177 | differently you probably want to use the C<HTML::PullParser> directly.
|
---|
| 178 |
|
---|
| 179 | The following methods are available:
|
---|
| 180 |
|
---|
| 181 | =over 4
|
---|
| 182 |
|
---|
| 183 | =item $p = HTML::TokeParser->new( $filename, %opt );
|
---|
| 184 |
|
---|
| 185 | =item $p = HTML::TokeParser->new( $filehandle, %opt );
|
---|
| 186 |
|
---|
| 187 | =item $p = HTML::TokeParser->new( \$document, %opt );
|
---|
| 188 |
|
---|
| 189 | The object constructor argument is either a file name, a file handle
|
---|
| 190 | object, or the complete document to be parsed. Extra options can be
|
---|
| 191 | provided as key/value pairs and are processed as documented by the base
|
---|
| 192 | classes.
|
---|
| 193 |
|
---|
| 194 | If the argument is a plain scalar, then it is taken as the name of a
|
---|
| 195 | file to be opened and parsed. If the file can't be opened for
|
---|
| 196 | reading, then the constructor will return C<undef> and $! will tell
|
---|
| 197 | you why it failed.
|
---|
| 198 |
|
---|
| 199 | If the argument is a reference to a plain scalar, then this scalar is
|
---|
| 200 | taken to be the literal document to parse. The value of this
|
---|
| 201 | scalar should not be changed before all tokens have been extracted.
|
---|
| 202 |
|
---|
| 203 | Otherwise the argument is taken to be some object that the
|
---|
| 204 | C<HTML::TokeParser> can read() from when it needs more data. Typically
|
---|
| 205 | it will be a filehandle of some kind. The stream will be read() until
|
---|
| 206 | EOF, but not closed.
|
---|
| 207 |
|
---|
| 208 | A newly constructed C<HTML::TokeParser> differ from its base classes
|
---|
| 209 | by having the C<unbroken_text> attribute enabled by default. See
|
---|
| 210 | L<HTML::Parser> for a description of this and other attributes that
|
---|
| 211 | influence how the document is parsed. It is often a good idea to enable
|
---|
| 212 | C<empty_element_tags> behaviour.
|
---|
| 213 |
|
---|
| 214 | Note that the parsing result will likely not be valid if raw undecoded
|
---|
| 215 | UTF-8 is used as a source. When parsing UTF-8 encoded files turn
|
---|
| 216 | on UTF-8 decoding:
|
---|
| 217 |
|
---|
| 218 | open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
|
---|
| 219 | my $p = HTML::TokeParser->new( $fh );
|
---|
| 220 | # ...
|
---|
| 221 |
|
---|
| 222 | If a $filename is passed to the constructor the file will be opened in
|
---|
| 223 | raw mode and the parsing result will only be valid if its content is
|
---|
| 224 | Latin-1 or pure ASCII.
|
---|
| 225 |
|
---|
| 226 | If parsing from an UTF-8 encoded string buffer decode it first:
|
---|
| 227 |
|
---|
| 228 | utf8::decode($document);
|
---|
| 229 | my $p = HTML::TokeParser->new( \$document );
|
---|
| 230 | # ...
|
---|
| 231 |
|
---|
| 232 | =item $p->get_token
|
---|
| 233 |
|
---|
| 234 | This method will return the next I<token> found in the HTML document,
|
---|
| 235 | or C<undef> at the end of the document. The token is returned as an
|
---|
| 236 | array reference. The first element of the array will be a string
|
---|
| 237 | denoting the type of this token: "S" for start tag, "E" for end tag,
|
---|
| 238 | "T" for text, "C" for comment, "D" for declaration, and "PI" for
|
---|
| 239 | process instructions. The rest of the token array depend on the type
|
---|
| 240 | like this:
|
---|
| 241 |
|
---|
| 242 | ["S", $tag, $attr, $attrseq, $text]
|
---|
| 243 | ["E", $tag, $text]
|
---|
| 244 | ["T", $text, $is_data]
|
---|
| 245 | ["C", $text]
|
---|
| 246 | ["D", $text]
|
---|
| 247 | ["PI", $token0, $text]
|
---|
| 248 |
|
---|
| 249 | where $attr is a hash reference, $attrseq is an array reference and
|
---|
| 250 | the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
|
---|
| 251 | details.
|
---|
| 252 |
|
---|
| 253 | =item $p->unget_token( @tokens )
|
---|
| 254 |
|
---|
| 255 | If you find you have read too many tokens you can push them back,
|
---|
| 256 | so that they are returned the next time $p->get_token is called.
|
---|
| 257 |
|
---|
| 258 | =item $p->get_tag
|
---|
| 259 |
|
---|
| 260 | =item $p->get_tag( @tags )
|
---|
| 261 |
|
---|
| 262 | This method returns the next start or end tag (skipping any other
|
---|
| 263 | tokens), or C<undef> if there are no more tags in the document. If
|
---|
| 264 | one or more arguments are given, then we skip tokens until one of the
|
---|
| 265 | specified tag types is found. For example:
|
---|
| 266 |
|
---|
| 267 | $p->get_tag("font", "/font");
|
---|
| 268 |
|
---|
| 269 | will find the next start or end tag for a font-element.
|
---|
| 270 |
|
---|
| 271 | The tag information is returned as an array reference in the same form
|
---|
| 272 | as for $p->get_token above, but the type code (first element) is
|
---|
| 273 | missing. A start tag will be returned like this:
|
---|
| 274 |
|
---|
| 275 | [$tag, $attr, $attrseq, $text]
|
---|
| 276 |
|
---|
| 277 | The tagname of end tags are prefixed with "/", i.e. end tag is
|
---|
| 278 | returned like this:
|
---|
| 279 |
|
---|
| 280 | ["/$tag", $text]
|
---|
| 281 |
|
---|
| 282 | =item $p->get_text
|
---|
| 283 |
|
---|
| 284 | =item $p->get_text( @endtags )
|
---|
| 285 |
|
---|
| 286 | This method returns all text found at the current position. It will
|
---|
| 287 | return a zero length string if the next token is not text. Any
|
---|
| 288 | entities will be converted to their corresponding character.
|
---|
| 289 |
|
---|
| 290 | If one or more arguments are given, then we return all text occurring
|
---|
| 291 | before the first of the specified tags found. For example:
|
---|
| 292 |
|
---|
| 293 | $p->get_text("p", "br");
|
---|
| 294 |
|
---|
| 295 | will return the text up to either a paragraph of linebreak element.
|
---|
| 296 |
|
---|
| 297 | The text might span tags that should be I<textified>. This is
|
---|
| 298 | controlled by the $p->{textify} attribute, which is a hash that
|
---|
| 299 | defines how certain tags can be treated as text. If the name of a
|
---|
| 300 | start tag matches a key in this hash then this tag is converted to
|
---|
| 301 | text. The hash value is used to specify which tag attribute to obtain
|
---|
| 302 | the text from. If this tag attribute is missing, then the upper case
|
---|
| 303 | name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
|
---|
| 304 | hash value can also be a subroutine reference. In this case the
|
---|
| 305 | routine is called with the start tag token content as its argument and
|
---|
| 306 | the return value is treated as the text.
|
---|
| 307 |
|
---|
| 308 | The default $p->{textify} value is:
|
---|
| 309 |
|
---|
| 310 | {img => "alt", applet => "alt"}
|
---|
| 311 |
|
---|
| 312 | This means that <IMG> and <APPLET> tags are treated as text, and that
|
---|
| 313 | the text to substitute can be found in the ALT attribute.
|
---|
| 314 |
|
---|
| 315 | =item $p->get_trimmed_text
|
---|
| 316 |
|
---|
| 317 | =item $p->get_trimmed_text( @endtags )
|
---|
| 318 |
|
---|
| 319 | Same as $p->get_text above, but will collapse any sequences of white
|
---|
| 320 | space to a single space character. Leading and trailing white space is
|
---|
| 321 | removed.
|
---|
| 322 |
|
---|
| 323 | =item $p->get_phrase
|
---|
| 324 |
|
---|
| 325 | This will return all text found at the current position ignoring any
|
---|
| 326 | phrasal-level tags. Text is extracted until the first non
|
---|
| 327 | phrasal-level tag. Textification of tags is the same as for
|
---|
| 328 | get_text(). This method will collapse white space in the same way as
|
---|
| 329 | get_trimmed_text() does.
|
---|
| 330 |
|
---|
| 331 | The definition of <i>phrasal-level tags</i> is obtained from the
|
---|
| 332 | HTML::Tagset module.
|
---|
| 333 |
|
---|
| 334 | =back
|
---|
| 335 |
|
---|
| 336 | =head1 EXAMPLES
|
---|
| 337 |
|
---|
| 338 | This example extracts all links from a document. It will print one
|
---|
| 339 | line for each link, containing the URL and the textual description
|
---|
| 340 | between the <A>...</A> tags:
|
---|
| 341 |
|
---|
| 342 | use HTML::TokeParser;
|
---|
| 343 | $p = HTML::TokeParser->new(shift||"index.html");
|
---|
| 344 |
|
---|
| 345 | while (my $token = $p->get_tag("a")) {
|
---|
| 346 | my $url = $token->[1]{href} || "-";
|
---|
| 347 | my $text = $p->get_trimmed_text("/a");
|
---|
| 348 | print "$url\t$text\n";
|
---|
| 349 | }
|
---|
| 350 |
|
---|
| 351 | This example extract the <TITLE> from the document:
|
---|
| 352 |
|
---|
| 353 | use HTML::TokeParser;
|
---|
| 354 | $p = HTML::TokeParser->new(shift||"index.html");
|
---|
| 355 | if ($p->get_tag("title")) {
|
---|
| 356 | my $title = $p->get_trimmed_text;
|
---|
| 357 | print "Title: $title\n";
|
---|
| 358 | }
|
---|
| 359 |
|
---|
| 360 | =head1 SEE ALSO
|
---|
| 361 |
|
---|
| 362 | L<HTML::PullParser>, L<HTML::Parser>
|
---|
| 363 |
|
---|
| 364 | =head1 COPYRIGHT
|
---|
| 365 |
|
---|
| 366 | Copyright 1998-2005 Gisle Aas.
|
---|
| 367 |
|
---|
| 368 | This library is free software; you can redistribute it and/or
|
---|
| 369 | modify it under the same terms as Perl itself.
|
---|
| 370 |
|
---|
| 371 | =cut
|
---|