1 | package HTML::TokeParser;
|
---|
2 |
|
---|
3 | # $Id: TokeParser.pm 22649 2010-08-17 02:03:34Z davidb $
|
---|
4 |
|
---|
5 | require HTML::PullParser;
|
---|
6 | @ISA=qw(HTML::PullParser);
|
---|
7 | #$VERSION = sprintf("%d.%02d", q$Revision: 22649 $ =~ /(\d+)\.(\d+)/);
|
---|
8 |
|
---|
9 | use strict;
|
---|
10 | use Carp ();
|
---|
11 | use HTML::Entities qw(decode_entities);
|
---|
12 | use HTML::Tagset ();
|
---|
13 |
|
---|
14 | my %ARGS =
|
---|
15 | (
|
---|
16 | start => "'S',tagname,attr,attrseq,text",
|
---|
17 | end => "'E',tagname,text",
|
---|
18 | text => "'T',text,is_cdata",
|
---|
19 | process => "'PI',token0,text",
|
---|
20 | comment => "'C',text",
|
---|
21 | declaration => "'D',text",
|
---|
22 |
|
---|
23 | # options that default on
|
---|
24 | unbroken_text => 1,
|
---|
25 | );
|
---|
26 |
|
---|
27 |
|
---|
28 | sub new
|
---|
29 | {
|
---|
30 | my $class = shift;
|
---|
31 | my %cnf;
|
---|
32 | if (@_ == 1) {
|
---|
33 | my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
|
---|
34 | %cnf = ($type => $_[0]);
|
---|
35 | }
|
---|
36 | else {
|
---|
37 | %cnf = @_;
|
---|
38 | }
|
---|
39 |
|
---|
40 | my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
|
---|
41 |
|
---|
42 | my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
|
---|
43 |
|
---|
44 | $self->{textify} = $textify;
|
---|
45 | $self;
|
---|
46 | }
|
---|
47 |
|
---|
48 |
|
---|
49 | sub get_tag
|
---|
50 | {
|
---|
51 | my $self = shift;
|
---|
52 | my $token;
|
---|
53 | while (1) {
|
---|
54 | $token = $self->get_token || return undef;
|
---|
55 | my $type = shift @$token;
|
---|
56 | next unless $type eq "S" || $type eq "E";
|
---|
57 | substr($token->[0], 0, 0) = "/" if $type eq "E";
|
---|
58 | return $token unless @_;
|
---|
59 | for (@_) {
|
---|
60 | return $token if $token->[0] eq $_;
|
---|
61 | }
|
---|
62 | }
|
---|
63 | }
|
---|
64 |
|
---|
65 |
|
---|
66 | sub _textify {
|
---|
67 | my($self, $token) = @_;
|
---|
68 | my $tag = $token->[1];
|
---|
69 | return undef unless exists $self->{textify}{$tag};
|
---|
70 |
|
---|
71 | my $alt = $self->{textify}{$tag};
|
---|
72 | my $text;
|
---|
73 | if (ref($alt)) {
|
---|
74 | $text = &$alt(@$token);
|
---|
75 | } else {
|
---|
76 | $text = $token->[2]{$alt || "alt"};
|
---|
77 | $text = "[\U$tag]" unless defined $text;
|
---|
78 | }
|
---|
79 | return $text;
|
---|
80 | }
|
---|
81 |
|
---|
82 |
|
---|
83 | sub get_text
|
---|
84 | {
|
---|
85 | my $self = shift;
|
---|
86 | my @text;
|
---|
87 | while (my $token = $self->get_token) {
|
---|
88 | my $type = $token->[0];
|
---|
89 | if ($type eq "T") {
|
---|
90 | my $text = $token->[1];
|
---|
91 | decode_entities($text) unless $token->[2];
|
---|
92 | push(@text, $text);
|
---|
93 | } elsif ($type =~ /^[SE]$/) {
|
---|
94 | my $tag = $token->[1];
|
---|
95 | if ($type eq "S") {
|
---|
96 | if (defined(my $text = _textify($self, $token))) {
|
---|
97 | push(@text, $text);
|
---|
98 | next;
|
---|
99 | }
|
---|
100 | } else {
|
---|
101 | $tag = "/$tag";
|
---|
102 | }
|
---|
103 | if (!@_ || grep $_ eq $tag, @_) {
|
---|
104 | $self->unget_token($token);
|
---|
105 | last;
|
---|
106 | }
|
---|
107 | push(@text, " ")
|
---|
108 | if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
|
---|
109 | }
|
---|
110 | }
|
---|
111 | join("", @text);
|
---|
112 | }
|
---|
113 |
|
---|
114 |
|
---|
115 | sub get_trimmed_text
|
---|
116 | {
|
---|
117 | my $self = shift;
|
---|
118 | my $text = $self->get_text(@_);
|
---|
119 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
---|
120 | $text;
|
---|
121 | }
|
---|
122 |
|
---|
123 | sub get_phrase {
|
---|
124 | my $self = shift;
|
---|
125 | my @text;
|
---|
126 | while (my $token = $self->get_token) {
|
---|
127 | my $type = $token->[0];
|
---|
128 | if ($type eq "T") {
|
---|
129 | my $text = $token->[1];
|
---|
130 | decode_entities($text) unless $token->[2];
|
---|
131 | push(@text, $text);
|
---|
132 | } elsif ($type =~ /^[SE]$/) {
|
---|
133 | my $tag = $token->[1];
|
---|
134 | if ($type eq "S") {
|
---|
135 | if (defined(my $text = _textify($self, $token))) {
|
---|
136 | push(@text, $text);
|
---|
137 | next;
|
---|
138 | }
|
---|
139 | }
|
---|
140 | if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
|
---|
141 | $self->unget_token($token);
|
---|
142 | last;
|
---|
143 | }
|
---|
144 | push(@text, " ") if $tag eq "br";
|
---|
145 | }
|
---|
146 | }
|
---|
147 | my $text = join("", @text);
|
---|
148 | $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
|
---|
149 | $text;
|
---|
150 | }
|
---|
151 |
|
---|
152 | 1;
|
---|
153 |
|
---|
154 |
|
---|
155 | __END__
|
---|
156 |
|
---|
157 | =head1 NAME
|
---|
158 |
|
---|
159 | HTML::TokeParser - Alternative HTML::Parser interface
|
---|
160 |
|
---|
161 | =head1 SYNOPSIS
|
---|
162 |
|
---|
163 | require HTML::TokeParser;
|
---|
164 | $p = HTML::TokeParser->new("index.html") ||
|
---|
165 | die "Can't open: $!";
|
---|
166 | $p->empty_element_tags(1); # configure its behaviour
|
---|
167 |
|
---|
168 | while (my $token = $p->get_token) {
|
---|
169 | #...
|
---|
170 | }
|
---|
171 |
|
---|
172 | =head1 DESCRIPTION
|
---|
173 |
|
---|
174 | The C<HTML::TokeParser> is an alternative interface to the
|
---|
175 | C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
|
---|
176 | predeclared set of token types. If you wish the tokens to be reported
|
---|
177 | differently you probably want to use the C<HTML::PullParser> directly.
|
---|
178 |
|
---|
179 | The following methods are available:
|
---|
180 |
|
---|
181 | =over 4
|
---|
182 |
|
---|
183 | =item $p = HTML::TokeParser->new( $filename, %opt );
|
---|
184 |
|
---|
185 | =item $p = HTML::TokeParser->new( $filehandle, %opt );
|
---|
186 |
|
---|
187 | =item $p = HTML::TokeParser->new( \$document, %opt );
|
---|
188 |
|
---|
189 | The object constructor argument is either a file name, a file handle
|
---|
190 | object, or the complete document to be parsed. Extra options can be
|
---|
191 | provided as key/value pairs and are processed as documented by the base
|
---|
192 | classes.
|
---|
193 |
|
---|
194 | If the argument is a plain scalar, then it is taken as the name of a
|
---|
195 | file to be opened and parsed. If the file can't be opened for
|
---|
196 | reading, then the constructor will return C<undef> and $! will tell
|
---|
197 | you why it failed.
|
---|
198 |
|
---|
199 | If the argument is a reference to a plain scalar, then this scalar is
|
---|
200 | taken to be the literal document to parse. The value of this
|
---|
201 | scalar should not be changed before all tokens have been extracted.
|
---|
202 |
|
---|
203 | Otherwise the argument is taken to be some object that the
|
---|
204 | C<HTML::TokeParser> can read() from when it needs more data. Typically
|
---|
205 | it will be a filehandle of some kind. The stream will be read() until
|
---|
206 | EOF, but not closed.
|
---|
207 |
|
---|
208 | A newly constructed C<HTML::TokeParser> differ from its base classes
|
---|
209 | by having the C<unbroken_text> attribute enabled by default. See
|
---|
210 | L<HTML::Parser> for a description of this and other attributes that
|
---|
211 | influence how the document is parsed. It is often a good idea to enable
|
---|
212 | C<empty_element_tags> behaviour.
|
---|
213 |
|
---|
214 | Note that the parsing result will likely not be valid if raw undecoded
|
---|
215 | UTF-8 is used as a source. When parsing UTF-8 encoded files turn
|
---|
216 | on UTF-8 decoding:
|
---|
217 |
|
---|
218 | open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
|
---|
219 | my $p = HTML::TokeParser->new( $fh );
|
---|
220 | # ...
|
---|
221 |
|
---|
222 | If a $filename is passed to the constructor the file will be opened in
|
---|
223 | raw mode and the parsing result will only be valid if its content is
|
---|
224 | Latin-1 or pure ASCII.
|
---|
225 |
|
---|
226 | If parsing from an UTF-8 encoded string buffer decode it first:
|
---|
227 |
|
---|
228 | utf8::decode($document);
|
---|
229 | my $p = HTML::TokeParser->new( \$document );
|
---|
230 | # ...
|
---|
231 |
|
---|
232 | =item $p->get_token
|
---|
233 |
|
---|
234 | This method will return the next I<token> found in the HTML document,
|
---|
235 | or C<undef> at the end of the document. The token is returned as an
|
---|
236 | array reference. The first element of the array will be a string
|
---|
237 | denoting the type of this token: "S" for start tag, "E" for end tag,
|
---|
238 | "T" for text, "C" for comment, "D" for declaration, and "PI" for
|
---|
239 | process instructions. The rest of the token array depend on the type
|
---|
240 | like this:
|
---|
241 |
|
---|
242 | ["S", $tag, $attr, $attrseq, $text]
|
---|
243 | ["E", $tag, $text]
|
---|
244 | ["T", $text, $is_data]
|
---|
245 | ["C", $text]
|
---|
246 | ["D", $text]
|
---|
247 | ["PI", $token0, $text]
|
---|
248 |
|
---|
249 | where $attr is a hash reference, $attrseq is an array reference and
|
---|
250 | the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
|
---|
251 | details.
|
---|
252 |
|
---|
253 | =item $p->unget_token( @tokens )
|
---|
254 |
|
---|
255 | If you find you have read too many tokens you can push them back,
|
---|
256 | so that they are returned the next time $p->get_token is called.
|
---|
257 |
|
---|
258 | =item $p->get_tag
|
---|
259 |
|
---|
260 | =item $p->get_tag( @tags )
|
---|
261 |
|
---|
262 | This method returns the next start or end tag (skipping any other
|
---|
263 | tokens), or C<undef> if there are no more tags in the document. If
|
---|
264 | one or more arguments are given, then we skip tokens until one of the
|
---|
265 | specified tag types is found. For example:
|
---|
266 |
|
---|
267 | $p->get_tag("font", "/font");
|
---|
268 |
|
---|
269 | will find the next start or end tag for a font-element.
|
---|
270 |
|
---|
271 | The tag information is returned as an array reference in the same form
|
---|
272 | as for $p->get_token above, but the type code (first element) is
|
---|
273 | missing. A start tag will be returned like this:
|
---|
274 |
|
---|
275 | [$tag, $attr, $attrseq, $text]
|
---|
276 |
|
---|
277 | The tagname of end tags are prefixed with "/", i.e. end tag is
|
---|
278 | returned like this:
|
---|
279 |
|
---|
280 | ["/$tag", $text]
|
---|
281 |
|
---|
282 | =item $p->get_text
|
---|
283 |
|
---|
284 | =item $p->get_text( @endtags )
|
---|
285 |
|
---|
286 | This method returns all text found at the current position. It will
|
---|
287 | return a zero length string if the next token is not text. Any
|
---|
288 | entities will be converted to their corresponding character.
|
---|
289 |
|
---|
290 | If one or more arguments are given, then we return all text occurring
|
---|
291 | before the first of the specified tags found. For example:
|
---|
292 |
|
---|
293 | $p->get_text("p", "br");
|
---|
294 |
|
---|
295 | will return the text up to either a paragraph of linebreak element.
|
---|
296 |
|
---|
297 | The text might span tags that should be I<textified>. This is
|
---|
298 | controlled by the $p->{textify} attribute, which is a hash that
|
---|
299 | defines how certain tags can be treated as text. If the name of a
|
---|
300 | start tag matches a key in this hash then this tag is converted to
|
---|
301 | text. The hash value is used to specify which tag attribute to obtain
|
---|
302 | the text from. If this tag attribute is missing, then the upper case
|
---|
303 | name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
|
---|
304 | hash value can also be a subroutine reference. In this case the
|
---|
305 | routine is called with the start tag token content as its argument and
|
---|
306 | the return value is treated as the text.
|
---|
307 |
|
---|
308 | The default $p->{textify} value is:
|
---|
309 |
|
---|
310 | {img => "alt", applet => "alt"}
|
---|
311 |
|
---|
312 | This means that <IMG> and <APPLET> tags are treated as text, and that
|
---|
313 | the text to substitute can be found in the ALT attribute.
|
---|
314 |
|
---|
315 | =item $p->get_trimmed_text
|
---|
316 |
|
---|
317 | =item $p->get_trimmed_text( @endtags )
|
---|
318 |
|
---|
319 | Same as $p->get_text above, but will collapse any sequences of white
|
---|
320 | space to a single space character. Leading and trailing white space is
|
---|
321 | removed.
|
---|
322 |
|
---|
323 | =item $p->get_phrase
|
---|
324 |
|
---|
325 | This will return all text found at the current position ignoring any
|
---|
326 | phrasal-level tags. Text is extracted until the first non
|
---|
327 | phrasal-level tag. Textification of tags is the same as for
|
---|
328 | get_text(). This method will collapse white space in the same way as
|
---|
329 | get_trimmed_text() does.
|
---|
330 |
|
---|
331 | The definition of <i>phrasal-level tags</i> is obtained from the
|
---|
332 | HTML::Tagset module.
|
---|
333 |
|
---|
334 | =back
|
---|
335 |
|
---|
336 | =head1 EXAMPLES
|
---|
337 |
|
---|
338 | This example extracts all links from a document. It will print one
|
---|
339 | line for each link, containing the URL and the textual description
|
---|
340 | between the <A>...</A> tags:
|
---|
341 |
|
---|
342 | use HTML::TokeParser;
|
---|
343 | $p = HTML::TokeParser->new(shift||"index.html");
|
---|
344 |
|
---|
345 | while (my $token = $p->get_tag("a")) {
|
---|
346 | my $url = $token->[1]{href} || "-";
|
---|
347 | my $text = $p->get_trimmed_text("/a");
|
---|
348 | print "$url\t$text\n";
|
---|
349 | }
|
---|
350 |
|
---|
351 | This example extract the <TITLE> from the document:
|
---|
352 |
|
---|
353 | use HTML::TokeParser;
|
---|
354 | $p = HTML::TokeParser->new(shift||"index.html");
|
---|
355 | if ($p->get_tag("title")) {
|
---|
356 | my $title = $p->get_trimmed_text;
|
---|
357 | print "Title: $title\n";
|
---|
358 | }
|
---|
359 |
|
---|
360 | =head1 SEE ALSO
|
---|
361 |
|
---|
362 | L<HTML::PullParser>, L<HTML::Parser>
|
---|
363 |
|
---|
364 | =head1 COPYRIGHT
|
---|
365 |
|
---|
366 | Copyright 1998-2005 Gisle Aas.
|
---|
367 |
|
---|
368 | This library is free software; you can redistribute it and/or
|
---|
369 | modify it under the same terms as Perl itself.
|
---|
370 |
|
---|
371 | =cut
|
---|