source: trunk/gsdl/perllib/cpan/HTML/TokeParser.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago

Perl modules required for HTMLTidy

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1package HTML::TokeParser;
2
3# $Id: TokeParser.pm 14078 2007-05-17 03:15:41Z lh92 $
4
5require HTML::PullParser;
6@ISA=qw(HTML::PullParser);
7$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
8
9use strict;
10use Carp ();
11use HTML::Entities qw(decode_entities);
12use HTML::Tagset ();
13
14my %ARGS =
15(
16 start => "'S',tagname,attr,attrseq,text",
17 end => "'E',tagname,text",
18 text => "'T',text,is_cdata",
19 process => "'PI',token0,text",
20 comment => "'C',text",
21 declaration => "'D',text",
22
23 # options that default on
24 unbroken_text => 1,
25);
26
27
28sub new
29{
30 my $class = shift;
31 my %cnf;
32 if (@_ == 1) {
33 my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
34 %cnf = ($type => $_[0]);
35 }
36 else {
37 %cnf = @_;
38 }
39
40 my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
41
42 my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
43
44 $self->{textify} = $textify;
45 $self;
46}
47
48
49sub get_tag
50{
51 my $self = shift;
52 my $token;
53 while (1) {
54 $token = $self->get_token || return undef;
55 my $type = shift @$token;
56 next unless $type eq "S" || $type eq "E";
57 substr($token->[0], 0, 0) = "/" if $type eq "E";
58 return $token unless @_;
59 for (@_) {
60 return $token if $token->[0] eq $_;
61 }
62 }
63}
64
65
66sub _textify {
67 my($self, $token) = @_;
68 my $tag = $token->[1];
69 return undef unless exists $self->{textify}{$tag};
70
71 my $alt = $self->{textify}{$tag};
72 my $text;
73 if (ref($alt)) {
74 $text = &$alt(@$token);
75 } else {
76 $text = $token->[2]{$alt || "alt"};
77 $text = "[\U$tag]" unless defined $text;
78 }
79 return $text;
80}
81
82
83sub get_text
84{
85 my $self = shift;
86 my @text;
87 while (my $token = $self->get_token) {
88 my $type = $token->[0];
89 if ($type eq "T") {
90 my $text = $token->[1];
91 decode_entities($text) unless $token->[2];
92 push(@text, $text);
93 } elsif ($type =~ /^[SE]$/) {
94 my $tag = $token->[1];
95 if ($type eq "S") {
96 if (defined(my $text = _textify($self, $token))) {
97 push(@text, $text);
98 next;
99 }
100 } else {
101 $tag = "/$tag";
102 }
103 if (!@_ || grep $_ eq $tag, @_) {
104 $self->unget_token($token);
105 last;
106 }
107 push(@text, " ")
108 if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
109 }
110 }
111 join("", @text);
112}
113
114
115sub get_trimmed_text
116{
117 my $self = shift;
118 my $text = $self->get_text(@_);
119 $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
120 $text;
121}
122
123sub get_phrase {
124 my $self = shift;
125 my @text;
126 while (my $token = $self->get_token) {
127 my $type = $token->[0];
128 if ($type eq "T") {
129 my $text = $token->[1];
130 decode_entities($text) unless $token->[2];
131 push(@text, $text);
132 } elsif ($type =~ /^[SE]$/) {
133 my $tag = $token->[1];
134 if ($type eq "S") {
135 if (defined(my $text = _textify($self, $token))) {
136 push(@text, $text);
137 next;
138 }
139 }
140 if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
141 $self->unget_token($token);
142 last;
143 }
144 push(@text, " ") if $tag eq "br";
145 }
146 }
147 my $text = join("", @text);
148 $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
149 $text;
150}
151
1521;
153
154
155__END__
156
157=head1 NAME
158
159HTML::TokeParser - Alternative HTML::Parser interface
160
161=head1 SYNOPSIS
162
163 require HTML::TokeParser;
164 $p = HTML::TokeParser->new("index.html") ||
165 die "Can't open: $!";
166 $p->empty_element_tags(1); # configure its behaviour
167
168 while (my $token = $p->get_token) {
169 #...
170 }
171
172=head1 DESCRIPTION
173
174The C<HTML::TokeParser> is an alternative interface to the
175C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
176predeclared set of token types. If you wish the tokens to be reported
177differently you probably want to use the C<HTML::PullParser> directly.
178
179The following methods are available:
180
181=over 4
182
183=item $p = HTML::TokeParser->new( $filename, %opt );
184
185=item $p = HTML::TokeParser->new( $filehandle, %opt );
186
187=item $p = HTML::TokeParser->new( \$document, %opt );
188
189The object constructor argument is either a file name, a file handle
190object, or the complete document to be parsed. Extra options can be
191provided as key/value pairs and are processed as documented by the base
192classes.
193
194If the argument is a plain scalar, then it is taken as the name of a
195file to be opened and parsed. If the file can't be opened for
196reading, then the constructor will return C<undef> and $! will tell
197you why it failed.
198
199If the argument is a reference to a plain scalar, then this scalar is
200taken to be the literal document to parse. The value of this
201scalar should not be changed before all tokens have been extracted.
202
203Otherwise the argument is taken to be some object that the
204C<HTML::TokeParser> can read() from when it needs more data. Typically
205it will be a filehandle of some kind. The stream will be read() until
206EOF, but not closed.
207
208A newly constructed C<HTML::TokeParser> differ from its base classes
209by having the C<unbroken_text> attribute enabled by default. See
210L<HTML::Parser> for a description of this and other attributes that
211influence how the document is parsed. It is often a good idea to enable
212C<empty_element_tags> behaviour.
213
214Note that the parsing result will likely not be valid if raw undecoded
215UTF-8 is used as a source. When parsing UTF-8 encoded files turn
216on UTF-8 decoding:
217
218 open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
219 my $p = HTML::TokeParser->new( $fh );
220 # ...
221
222If a $filename is passed to the constructor the file will be opened in
223raw mode and the parsing result will only be valid if its content is
224Latin-1 or pure ASCII.
225
226If parsing from an UTF-8 encoded string buffer decode it first:
227
228 utf8::decode($document);
229 my $p = HTML::TokeParser->new( \$document );
230 # ...
231
232=item $p->get_token
233
234This method will return the next I<token> found in the HTML document,
235or C<undef> at the end of the document. The token is returned as an
236array reference. The first element of the array will be a string
237denoting the type of this token: "S" for start tag, "E" for end tag,
238"T" for text, "C" for comment, "D" for declaration, and "PI" for
239process instructions. The rest of the token array depend on the type
240like this:
241
242 ["S", $tag, $attr, $attrseq, $text]
243 ["E", $tag, $text]
244 ["T", $text, $is_data]
245 ["C", $text]
246 ["D", $text]
247 ["PI", $token0, $text]
248
249where $attr is a hash reference, $attrseq is an array reference and
250the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
251details.
252
253=item $p->unget_token( @tokens )
254
255If you find you have read too many tokens you can push them back,
256so that they are returned the next time $p->get_token is called.
257
258=item $p->get_tag
259
260=item $p->get_tag( @tags )
261
262This method returns the next start or end tag (skipping any other
263tokens), or C<undef> if there are no more tags in the document. If
264one or more arguments are given, then we skip tokens until one of the
265specified tag types is found. For example:
266
267 $p->get_tag("font", "/font");
268
269will find the next start or end tag for a font-element.
270
271The tag information is returned as an array reference in the same form
272as for $p->get_token above, but the type code (first element) is
273missing. A start tag will be returned like this:
274
275 [$tag, $attr, $attrseq, $text]
276
277The tagname of end tags are prefixed with "/", i.e. end tag is
278returned like this:
279
280 ["/$tag", $text]
281
282=item $p->get_text
283
284=item $p->get_text( @endtags )
285
286This method returns all text found at the current position. It will
287return a zero length string if the next token is not text. Any
288entities will be converted to their corresponding character.
289
290If one or more arguments are given, then we return all text occurring
291before the first of the specified tags found. For example:
292
293 $p->get_text("p", "br");
294
295will return the text up to either a paragraph of linebreak element.
296
297The text might span tags that should be I<textified>. This is
298controlled by the $p->{textify} attribute, which is a hash that
299defines how certain tags can be treated as text. If the name of a
300start tag matches a key in this hash then this tag is converted to
301text. The hash value is used to specify which tag attribute to obtain
302the text from. If this tag attribute is missing, then the upper case
303name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
304hash value can also be a subroutine reference. In this case the
305routine is called with the start tag token content as its argument and
306the return value is treated as the text.
307
308The default $p->{textify} value is:
309
310 {img => "alt", applet => "alt"}
311
312This means that <IMG> and <APPLET> tags are treated as text, and that
313the text to substitute can be found in the ALT attribute.
314
315=item $p->get_trimmed_text
316
317=item $p->get_trimmed_text( @endtags )
318
319Same as $p->get_text above, but will collapse any sequences of white
320space to a single space character. Leading and trailing white space is
321removed.
322
323=item $p->get_phrase
324
325This will return all text found at the current position ignoring any
326phrasal-level tags. Text is extracted until the first non
327phrasal-level tag. Textification of tags is the same as for
328get_text(). This method will collapse white space in the same way as
329get_trimmed_text() does.
330
331The definition of <i>phrasal-level tags</i> is obtained from the
332HTML::Tagset module.
333
334=back
335
336=head1 EXAMPLES
337
338This example extracts all links from a document. It will print one
339line for each link, containing the URL and the textual description
340between the <A>...</A> tags:
341
342 use HTML::TokeParser;
343 $p = HTML::TokeParser->new(shift||"index.html");
344
345 while (my $token = $p->get_tag("a")) {
346 my $url = $token->[1]{href} || "-";
347 my $text = $p->get_trimmed_text("/a");
348 print "$url\t$text\n";
349 }
350
351This example extract the <TITLE> from the document:
352
353 use HTML::TokeParser;
354 $p = HTML::TokeParser->new(shift||"index.html");
355 if ($p->get_tag("title")) {
356 my $title = $p->get_trimmed_text;
357 print "Title: $title\n";
358 }
359
360=head1 SEE ALSO
361
362L<HTML::PullParser>, L<HTML::Parser>
363
364=head1 COPYRIGHT
365
366Copyright 1998-2005 Gisle Aas.
367
368This library is free software; you can redistribute it and/or
369modify it under the same terms as Perl itself.
370
371=cut
Note: See TracBrowser for help on using the repository browser.