source: trunk/gsdl/perllib/cpan/HTML/TokeParser/Simple.pm@ 14078

Last change on this file since 14078 was 13983, checked in by lh92, 17 years ago

Added for Realistic Book Project

  • Property svn:keywords set to Author Date Id Revision
File size: 19.7 KB
Line 
1package HTML::TokeParser::Simple;
2
3use strict;
4use HTML::TokeParser;
5use HTML::TokeParser::Simple::Token;
6use HTML::TokeParser::Simple::Token::Tag;
7use HTML::TokeParser::Simple::Token::Tag::Start;
8use HTML::TokeParser::Simple::Token::Tag::End;
9use HTML::TokeParser::Simple::Token::Text;
10use HTML::TokeParser::Simple::Token::Comment;
11use HTML::TokeParser::Simple::Token::Declaration;
12use HTML::TokeParser::Simple::Token::ProcessInstruction;
13
14use vars qw/ @ISA $VERSION $REVISION /;
15
16$REVISION = '$Id: Simple.pm 13983 2007-03-15 01:32:44Z lh92 $';
17$VERSION = '3.15';
18@ISA = qw/ HTML::TokeParser /;
19
20# constructors
21
22my %FACTORY_CLASSES = (
23 S => 'HTML::TokeParser::Simple::Token::Tag::Start',
24 E => 'HTML::TokeParser::Simple::Token::Tag::End',
25 T => 'HTML::TokeParser::Simple::Token::Text',
26 C => 'HTML::TokeParser::Simple::Token::Comment',
27 D => 'HTML::TokeParser::Simple::Token::Declaration',
28 PI => 'HTML::TokeParser::Simple::Token::ProcessInstruction',
29);
30
31sub _croak {
32 my ($proto, $message) = @_;
33 require Carp;
34 Carp::croak($message);
35}
36
37sub new {
38 my ($class, @args) = @_;
39 return 1 == @args
40 ? $class->SUPER::new(@args)
41 : $class->_init(@args);
42}
43
44sub _init {
45 my ($class, $source_type, $source) = @_;
46 my %sources = (
47 file => sub { $source },
48 handle => sub { $source },
49 string => sub { \$source },
50 url => sub {
51 eval "require LWP::Simple";
52 $class->_croak("Cannot load LWP::Simple: $@") if $@;
53 my $content = LWP::Simple::get($source);
54 $class->_croak("Could not fetch content from ($source)")
55 unless defined $content;
56 return \$content;
57 },
58 );
59 unless (exists $sources{$source_type}) {
60 $class->_croak("Unknown source type ($source_type)");
61 }
62 return $class->new($sources{$source_type}->());
63}
64
65sub get_token {
66 my $self = shift;
67 my @args = @_;
68 my $token = $self->SUPER::get_token( @args );
69 return unless defined $token;
70 if (my $factory_class = $FACTORY_CLASSES{$token->[0]}) {
71 return $factory_class->new($token);
72 }
73 else {
74 # this should never happen
75 $self->_croak("Cannot determine token class for token (@$token)");
76 }
77}
78
79sub get_tag {
80 my $self = shift;
81 my @args = @_;
82 my $token = $self->SUPER::get_tag( @args );
83 return unless defined $token;
84 return $token->[0] =~ /^\//
85 ? HTML::TokeParser::Simple::Token::Tag::End->new($token)
86 : HTML::TokeParser::Simple::Token::Tag::Start->new($token);
87}
88
89sub peek {
90 my ($self, $count) = @_;
91 $count ||= 1;
92
93 unless ($count =~ /^\d+$/) {
94 $self->_croak("Argument to peek() must be a positive integer, not ($count)");
95 }
96
97 my $items = 0;
98 my $html = '';
99 my @tokens;
100 while ( $items++ < $count && defined ( my $token = $self->get_token ) ) {
101 $html .= $token->as_is;
102 push @tokens, $token;
103 }
104 $self->unget_token(@tokens);
105 return $html;
106}
107
1081;
109
110__END__
111
112=head1 NAME
113
114HTML::TokeParser::Simple - Easy to use C<HTML::TokeParser> interface
115
116=head1 SYNOPSIS
117
118 use HTML::TokeParser::Simple;
119 my $p = HTML::TokeParser::Simple->new( $somefile );
120
121 while ( my $token = $p->get_token ) {
122 # This prints all text in an HTML doc (i.e., it strips the HTML)
123 next unless $token->is_text;
124 print $token->as_is;
125 }
126
127
128=head1 DESCRIPTION
129
130C<HTML::TokeParser> is an excellent module that's often used for parsing HTML.
131However, the tokens returned are not exactly intuitive to parse:
132
133 ["S", $tag, $attr, $attrseq, $text]
134 ["E", $tag, $text]
135 ["T", $text, $is_data]
136 ["C", $text]
137 ["D", $text]
138 ["PI", $token0, $text]
139
140To simplify this, C<HTML::TokeParser::Simple> allows the user ask more
141intuitive (read: more self-documenting) questions about the tokens returned.
142
143You can also rebuild some tags on the fly. Frequently, the attributes
144associated with start tags need to be altered, added to, or deleted. This
145functionality is built in.
146
147Since this is a subclass of C<HTML::TokeParser>, all C<HTML::TokeParser>
148methods are available. To truly appreciate the power of this module, please
149read the documentation for C<HTML::TokeParser> and C<HTML::Parser>.
150
151=head1 CONTRUCTORS
152
153=head2 C<new($source)>
154
155The constructor for C<HTML::TokeParser::Simple> can be used just like
156C<HTML::TokeParser>'s constructor:
157
158 my $parser = HTML::TokeParser::Simple->new($filename);
159 # or
160 my $parser = HTML::TokeParser::Simple->new($filehandle);
161 # or
162 my $parser = HTML::TokeParser::Simple->new(\$html_string);
163
164=head2 C<new($source_type, $source)>
165
166If you wish to be more explicit, there is a new style of
167constructor avaiable.
168
169 my $parser = HTML::TokeParser::Simple->new(file => $filename);
170 # or
171 my $parser = HTML::TokeParser::Simple->new(handle => $filehandle);
172 # or
173 my $parser = HTML::TokeParser::Simple->new(string => $html_string);
174
175Note that you do not have to provide a reference for the string if using the
176string constructor.
177
178As a convenience, you can also attempt to fetch the HTML directly from a URL.
179
180 my $parser = HTML::TokeParser::Simple->new(url => 'http://some.url');
181
182This method relies on C<LWP::Simple>. If this module is not found or the page
183cannot be fetched, the constructor will C<croak()>.
184
185=head1 PARSER METHODS
186
187=head2 get_token
188
189This method will return the next token that C<HTML::TokeParser::get_token()>
190method would return. However, it will be blessed into a class appropriate
191which represents the token type.
192
193=head2 get_tag
194
195This method will return the next token that C<HTML::TokeParser::get_tag()>
196method would return. However, it will be blessed into either the
197L<HTML::TokeParser::Simple::Token::Tag::Start> or
198L<HTML::TokeParser::Simple::Token::Tag::End> class.
199
200=head2 peek
201
202As of version C<3.14>, you can now C<peek()> at the upcomings tokens without
203affecting the state of the parser. By default, C<peek()> will return the text
204of the next token, but specifying an integer C<$count> will return the text of
205the next C<$count> tokens.
206
207This is useful when you're trying to debug where you are in a document.
208
209 warn $parser->peek(3); # show the next 3 tokens
210
211=head1 ACCESSORS
212
213The following methods may be called on the token object which is returned,
214not on the parser object.
215
216=head2 Boolean Accessors
217
218These accessors return true or false.
219
220=over 4
221
222=item * C<is_tag([$tag])>
223
224Use this to determine if you have any tag. An optional "tag type" may be
225passed. This will allow you to match if it's a I<particular> tag. The
226supplied tag is case-insensitive.
227
228 if ( $token->is_tag ) { ... }
229
230Optionally, you may pass a regular expression as an argument.
231
232=item * C<is_start_tag([$tag])>
233
234Use this to determine if you have a start tag. An optional "tag type" may be
235passed. This will allow you to match if it's a I<particular> start tag. The
236supplied tag is case-insensitive.
237
238 if ( $token->is_start_tag ) { ... }
239 if ( $token->is_start_tag( 'font' ) ) { ... }
240
241Optionally, you may pass a regular expression as an argument. To match all
242header (h1, h2, ... h6) tags:
243
244 if ( $token->is_start_tag( qr/^h[123456]$/ ) ) { ... }
245
246=item * C<is_end_tag([$tag])>
247
248Use this to determine if you have an end tag. An optional "tag type" may be
249passed. This will allow you to match if it's a I<particular> end tag. The
250supplied tag is case-insensitive.
251
252When testing for an end tag, the forward slash on the tag is optional.
253
254 while ( $token = $p->get_token ) {
255 if ( $token->is_end_tag( 'form' ) ) { ... }
256 }
257
258Or:
259
260 while ( $token = $p->get_token ) {
261 if ( $token->is_end_tag( '/form' ) ) { ... }
262 }
263
264Optionally, you may pass a regular expression as an argument.
265
266=item * C<is_text()>
267
268Use this to determine if you have text. Note that this is I<not> to be
269confused with the C<return_text> (I<deprecated>) method described below!
270C<is_text> will identify text that the user typically sees display in the Web
271browser.
272
273=item * C<is_comment()>
274
275Are you still reading this? Nobody reads POD. Don't you know you're supposed
276to go to CLPM, ask a question that's answered in the POD and get flamed? It's
277a rite of passage.
278
279Really.
280
281C<is_comment> is used to identify comments. See the HTML::Parser documentation
282for more information about comments. There's more than you might think.
283
284=item * C<is_declaration()>
285
286This will match the DTD at the top of your HTML. (You I<do> use DTD's, don't
287you?)
288
289=item * C<is_process_instruction()>
290
291Process Instructions are from XML. This is very handy if you need to parse out
292PHP and similar things with a parser.
293
294Currently, there appear to be some problems with process instructions. You can
295override C<HTML::TokeParser::Simple::Token::ProcessInstruction> if you need to.
296
297=item * C<is_pi()>
298
299This is a shorthand for C<is_process_instruction()>.
300
301=back
302
303=head2 Data Accessors
304
305Some of these were originally C<return_> methods, but that name was not only
306unwieldy, but also went against reasonable conventions. The C<get_> methods
307listed below still have C<return_> methods available for backwards
308compatibility reasons, but they merely call their C<get_> counterpart. For
309example, calling C<return_tag()> actually calls C<get_tag()> internally.
310
311=over 4
312
313=item * C<get_tag()>
314
315Do you have a start tag or end tag? This will return the type (lower case).
316Note that this is I<not> the same as the C<get_tag()> method on the actual
317parser object.
318
319=item * C<get_attr([$attribute])>
320
321If you have a start tag, this will return a hash ref with the attribute names
322as keys and the values as the values.
323
324If you pass in an attribute name, it will return the value for just that
325attribute.
326
327Returns false if the token is not a start tag.
328
329=item * C<get_attrseq()>
330
331For a start tag, this is an array reference with the sequence of the
332attributes, if any.
333
334Returns false if the token is not a start tag.
335
336=item * C<return_text()>
337
338This method has been heavily deprecated (for a couple of years) in favor of
339C<as_is>. Programmers were getting confused over the difference between
340C<is_text>, C<return_text>, and some parser methods such as
341C<HTML::TokeParser::get_text> and friends.
342
343Using this method still succeeds, but will now carp and B<will be removed>
344in the next major release of this module.
345
346=item * C<as_is()>
347
348This is the exact text of whatever the token is representing.
349
350=item * C<get_token0()>
351
352For processing instructions, this will return the token found immediately after
353the opening tag. Example: For <?php, "php" will be the start of the returned
354string.
355
356Note that process instruction handling appears to be incomplete in
357C<HTML::TokeParser>.
358
359Returns false if the token is not a process instruction.
360
361=back
362
363=head1 MUTATORS
364
365The C<delete_attr()> and C<set_attr()> methods allow the programmer to rewrite
366start tag attributes on the fly. It should be noted that bad HTML will be
367"corrected" by this. Specifically, the new tag will have all attributes
368lower-cased with the values properly quoted.
369
370Self-closing tags (e.g. E<lt>hr /E<gt>) are also handled correctly. Some older
371browsers require a space prior to the final slash in a self-closed tag. If
372such a space is detected in the original HTML, it will be preserved.
373
374Calling a mutator on an token type that does not support that property is a
375no-op. For example:
376
377 if ($token->is_comment) {
378 $token->set_attr(foo => 'bar'); # does nothing
379 }
380
381=over 4
382
383=item * C<delete_attr($name)>
384
385This method attempts to delete the attribute specified. It will silently fail
386if called on anything other than a start tag. The argument is
387case-insensitive, but must otherwise be an exact match of the attribute you are
388attempting to delete. If the attribute is not found, the method will return
389without changing the tag.
390
391 # <body bgcolor="#FFFFFF">
392 $token->delete_attr('bgcolor');
393 print $token->as_is;
394 # <body>
395
396After this method is called, if successful, the C<as_is()>, C<get_attr()>
397and C<get_attrseq()> methods will all return updated results.
398
399=item * C<set_attr($name,$value)>
400
401This method will set the value of an attribute. If the attribute is not found,
402then C<get_attrseq()> will have the new attribute listed at the end.
403
404 # <p>
405 $token->set_attr(class => 'some_class');
406 print $token->as_is;
407 # <p class="some_class">
408
409 # <body bgcolor="#FFFFFF">
410 $token->set_attr('bgcolor','red');
411 print $token->as_is;
412 # <body bgcolor="red">
413
414After this method is called, if successful, the C<as_is()>, C<get_attr()>
415and C<get_attrseq()> methods will all return updated results.
416
417=item * C<set_attr($hashref)>
418
419Under the premise that C<set_> methods should accept what their corresponding
420C<get_> methods emit, the following works:
421
422 $tag->set_attr($tag->get_attr);
423
424Theoretically that's a no-op and for purposes of rendering HTML, it should be.
425However, internally this calls C<$tag-E<gt>rewrite_tag>, so see that method to
426understand how this may affect you.
427
428Of course, this is useless if you want to actually change the attributes, so you
429can do this:
430
431 my $attrs = {
432 class => 'headline',
433 valign => 'top'
434 };
435 $token->set_attr($attrs)
436 if $token->is_start_tag('td') && $token->get_attr('class') eq 'stories';
437
438=item * C<rewrite_tag()>
439
440This method rewrites the tag. The tag name and the name of all attributes will
441be lower-cased. Values that are not quoted with double quotes will be. This
442may be called on both start or end tags. Note that both C<set_attr()> and
443C<delete_attr()> call this method prior to returning.
444
445If called on a token that is not a tag, it simply returns. Regardless of how
446it is called, it returns the token.
447
448 # <body alink=#0000ff BGCOLOR=#ffffff class='none'>
449 $token->rewrite_tag;
450 print $token->as_is;
451 # <body alink="#0000ff" bgcolor="#ffffff" class="none">
452
453A quick cleanup of sloppy HTML is now the following:
454
455 my $parser = HTML::TokeParser::Simple->new( string => $ugly_html );
456 while (my $token = $parser->get_token) {
457 $token->rewrite_tag;
458 print $token->as_is;
459 }
460
461=back
462
463=head1 PARSER VERSUS TOKENS
464
465The parser returns tokens that are blessed into appropriate classes. Some
466people get confused and try to call parser methods on tokens and token methods
467on the parser. To prevent this, C<HTML::TokeParser::Simple> versions 1.4 and
468above now bless all tokens into appropriate token classes. Please keep this in
469mind while using this module (and many thanks to PodMaster
470L<http://www.perlmonks.org/index.pl?node_id=107642> for pointing out this issue
471to me.)
472
473=head1 EXAMPLES
474
475=head2 Finding comments
476
477For some strange reason, your Pointy-Haired Boss (PHB) is convinced that the
478graphics department is making fun of him by embedding rude things about him in
479HTML comments. You need to get all HTML comments from the HTML.
480
481 use strict;
482 use HTML::TokeParser::Simple;
483
484 my @html_docs = glob( "*.html" );
485
486 open PHB, "> phbreport.txt" or die "Cannot open phbreport for writing: $!";
487
488 foreach my $doc ( @html_docs ) {
489 print "Processing $doc\n";
490 my $p = HTML::TokeParser::Simple->new( file => $doc );
491 while ( my $token = $p->get_token ) {
492 next unless $token->is_comment;
493 print PHB $token->as_is, "\n";
494 }
495 }
496
497 close PHB;
498
499=head2 Stripping Comments
500
501Uh oh. Turns out that your PHB was right for a change. Many of the comments
502in the HTML weren't very polite. Since your entire graphics department was
503just fired, it falls on you need to strip those comments from the HTML.
504
505 use strict;
506 use HTML::TokeParser::Simple;
507
508 my $new_folder = 'no_comment/';
509 my @html_docs = glob( "*.html" );
510
511 foreach my $doc ( @html_docs ) {
512 print "Processing $doc\n";
513 my $new_file = "$new_folder$doc";
514
515 open PHB, "> $new_file" or die "Cannot open $new_file for writing: $!";
516
517 my $p = HTML::TokeParser::Simple->new( $file => doc );
518 while ( my $token = $p->get_token ) {
519 next if $token->is_comment;
520 print PHB $token->as_is;
521 }
522 close PHB;
523 }
524
525=head2 Changing form tags
526
527Your company was foo.com and now is bar.com. Unfortunately, whoever wrote your
528HTML decided to hardcode "http://www.foo.com/" into the C<action> attribute of
529the form tags. You need to change it to "http://www.bar.com/".
530
531 use strict;
532 use HTML::TokeParser::Simple;
533
534 my $new_folder = 'new_html/';
535 my @html_docs = glob( "*.html" );
536
537 foreach my $doc ( @html_docs ) {
538 print "Processing $doc\n";
539 my $new_file = "$new_folder$doc";
540
541 open FILE, "> $new_file" or die "Cannot open $new_file for writing: $!";
542
543 my $p = HTML::TokeParser::Simple->new( file => $doc );
544 while ( my $token = $p->get_token ) {
545 if ( $token->is_start_tag('form') ) {
546 my $action = $token->get_attr(action);
547 $action =~ s/www\.foo\.com/www.bar.com/;
548 $token->set_attr('action', $action);
549 }
550 print FILE $token->as_is;
551 }
552 close FILE;
553 }
554
555=head1 CAVEATS
556
557For compatability reasons with C<HTML::TokeParser>, methods that return
558references are violating encapsulation and altering the references directly
559B<will> alter the state of the object. Subsequent calls to C<rewrite_tag()>
560can thus have unexpected results. Do not alter these references directly
561unless you are following behavior described in these docs. In the future,
562certain methods such as C<get_attr>, C<get_attrseq> and others may return a
563copy of the reference rather than the original reference. This behavior has
564not yet been changed in order to maintain compatability with previous versions
565of this module. At the present time, your author is not aware of anyone taking
566advantage of this "feature," but it's better to be safe than sorry.
567
568Use of C<$HTML::Parser::VERSION> which is less than 3.25 may result in
569incorrect behavior as older versions do not always handle XHTML correctly. It
570is the programmer's responsibility to verify that the behavior of this code
571matches the programmer's needs.
572
573Note that C<HTML::Parser> processes text in 512 byte chunks. This sometimes
574will cause strange behavior and cause text to be broken into more than one
575token. You can suppress this behavior with the following command:
576
577 $p->unbroken_text( [$bool] );
578
579See the C<HTML::Parser> documentation and
580http://www.perlmonks.org/index.pl?node_id=230667 for more information.
581
582=head1 BUGS
583
584There are no known bugs, but that's no guarantee.
585
586Address bug reports and comments to: E<lt>[email protected]<gt>. When
587sending bug reports, please provide the version of C<HTML::Parser>,
588C<HTML::TokeParser>, C<HTML::TokeParser::Simple>, the version of Perl, and the
589version of the operating system you are using.
590
591Reverse the name to email the author.
592
593=head1 SUBCLASSING
594
595You may wish to change the behavior of this module. You probably do not want
596to subclass C<HTML::TokeParser::Simple>. Instead, you'll want to subclass one
597of the token classes. C<HTML::TokeParser::Simple::Token> is the base class for
598all tokens. Global behavioral changes should go there. Otherwise, see the
599appropriate token class for the behavior you wish to alter.
600
601=head1 SEE ALSO
602
603L<HTML::TokeParser::Simple::Token>
604
605L<HTML::TokeParser::Simple::Token::Tag>
606
607L<HTML::TokeParser::Simple::Token::Text>
608
609L<HTML::TokeParser::Simple::Token::Comment>
610
611L<HTML::TokeParser::Simple::Token::Declaration>
612
613L<HTML::TokeParser::Simple::Token::ProcessInstruction>
614
615=head1 COPYRIGHT
616
617Copyright (c) 2004 by Curtis "Ovid" Poe. All rights reserved. This program is
618free software; you may redistribute it and/or modify it under the same terms as
619Perl itself
620
621=head1 AUTHOR
622
623Curtis "Ovid" Poe E<lt>[email protected]<gt>
624
625Reverse the name to email the author.
626
627=cut
Note: See TracBrowser for help on using the repository browser.