source: trunk/gsdl/perllib/cpan/HTML/PullParser.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago

Perl modules required for HTMLTidy

  • Property svn:keywords set to Author Date Id Revision
File size: 5.6 KB
Line 
1package HTML::PullParser;
2
3# $Id: PullParser.pm 14078 2007-05-17 03:15:41Z lh92 $
4
5require HTML::Parser;
# Inherit the event-driven parsing machinery from HTML::Parser.
our @ISA = qw(HTML::Parser);

# Derive $VERSION from the revision keyword expanded by the version control
# system.  The keyword may expand to a dotted CVS-style revision ("2.9") or
# to a bare Subversion revision ("14078").  The minor part is therefore
# optional in the pattern; without that, a bare revision fails to match,
# sprintf() receives no arguments and $VERSION silently ends up as "0.00".
our $VERSION = do {
    my ($major, $minor) = q$Revision: 14078 $ =~ /(\d+)(?:\.(\d+))?/;
    sprintf("%d.%02d", $major || 0, $minor || 0);
};
8
9use strict;
10use Carp ();
11
# Constructor: HTML::PullParser->new(file => $file | doc => $doc, %options)
#
# Recognised options:
#   start, end, text, declaration, comment, process, default
#       argspec for the corresponding event; at least one must be defined.
#   file
#       a file name (opened for reading through IO::File) or something
#       that is already a handle/reference, which is stored as given.
#   doc
#       the document itself, either as a plain string or as a reference.
# Exactly one of 'file' and 'doc' must be defined.  Every remaining option
# is handed to the parent class constructor, with api_version forced to 3.
#
# Croaks on an invalid combination of options.  Returns the new parser, or
# nothing if a file name was given and IO::File could not open it.
sub new {
    my ($class, %options) = @_;

    # Separate the per-event argspecs from the parent class options.
    my %spec_for;
    foreach my $event (qw(start end text declaration comment process default)) {
        my $spec = delete $options{$event};
        $spec_for{$event} = $spec if defined $spec;
    }
    Carp::croak("Info not collected for any events") unless %spec_for;

    # Exactly one input source has to be supplied.
    my $file      = delete $options{file};
    my $doc       = delete $options{doc};
    my $have_file = defined $file;
    my $have_doc  = defined $doc;
    Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
        if $have_file && $have_doc;
    Carp::croak("No 'doc' or 'file' given to parse from")
        if !$have_file && !$have_doc;

    # Build the underlying parser object.
    $options{api_version} = 3;
    my $self = $class->SUPER::new(%options);

    # Every requested event gets the same array as its handler, so that
    # get_token() can later shift the collected tokens off that array.
    # The parent's handler() is called explicitly because this class
    # overrides handler() to reject calls.
    my $queue = [];
    $self->{pullparser_accum} = $queue;
    foreach my $event (keys %spec_for) {
        $self->SUPER::handler($event, $queue, $spec_for{$event});
    }

    if ($have_doc) {
        # Always keep a reference to the text; a plain string is wrapped.
        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
        $self->{pullparser_str_pos} = 0;
        return $self;
    }

    # Anything that is a reference or a glob is taken to be a handle
    # already; everything else is treated as a file name.
    my $is_handle = ref($file) || ref(\$file) eq "GLOB";
    unless ($is_handle) {
        require IO::File;
        $file = IO::File->new($file, "r") or return;
    }
    $self->{pullparser_file} = $file;

    return $self;
}
56
57
# Handlers cannot be changed on a pull parser: any call to this method
# croaks.  The constructor installs its handlers through the parent class
# (SUPER::handler) rather than through this override.
sub handler {
    my $complaint = "Can't set handlers for HTML::PullParser";
    Carp::croak($complaint);
}
62
63
# Return the next queued token, feeding the parser more input as needed.
#
# While the token queue is empty and the input is not exhausted, up to 512
# more bytes/characters are taken from the file handle or from the document
# string and passed to parse().  Once the source runs dry, eof() is called,
# the parser is flagged as finished and the source bookkeeping is dropped.
#
# Returns the first queued token, or undef when none is left.
sub get_token {
    my $self  = shift;
    my $queue = $self->{pullparser_accum};

    until (@$queue || $self->{pullparser_eof}) {
        if (my $fh = $self->{pullparser_file}) {
            # File input: a false result from read() ends the input.
            my $chunk;
            unless (read($fh, $chunk, 512)) {
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_file};
                next;
            }
            $self->parse($chunk);
        }
        elsif (my $doc_ref = $self->{pullparser_str_ref}) {
            # String input: hand over the next slice of the document.
            my $offset = $self->{pullparser_str_pos};
            my $piece  = substr($$doc_ref, $offset, 512);
            $self->parse($piece);

            my $next_offset = $offset + length($piece);
            if ($next_offset >= length($$doc_ref)) {
                # The whole string has been consumed.
                $self->eof;
                $self->{pullparser_eof}++;
                delete $self->{pullparser_str_ref};
                delete $self->{pullparser_str_pos};
            }
            else {
                $self->{pullparser_str_pos} = $next_offset;
            }
        }
        else {
            # Neither a file nor a string is left although EOF was never
            # flagged.
            die;
        }
    }

    return shift @$queue;
}
101
102
# Push tokens back onto the front of the queue, in the order given, so that
# get_token() hands them out again.  Returns the parser object itself.
sub unget_token {
    my ($self, @tokens) = @_;
    unshift @{ $self->{pullparser_accum} }, @tokens;
    return $self;
}
109
1101;
111
112
113__END__
114
115=head1 NAME
116
117HTML::PullParser - Alternative HTML::Parser interface
118
119=head1 SYNOPSIS
120
121 use HTML::PullParser;
122
123 $p = HTML::PullParser->new(file => "index.html",
124 start => 'event, tagname, @attr',
125 end => 'event, tagname',
126 ignore_elements => [qw(script style)],
127 ) || die "Can't open: $!";
128 while (my $token = $p->get_token) {
129 #...do something with $token
130 }
131
132=head1 DESCRIPTION
133
134The HTML::PullParser is an alternative interface to the HTML::Parser class.
135It basically turns the HTML::Parser inside out. You associate a file
136(or any IO::Handle object or string) with the parser at construction time and
137then repeatedly call $parser->get_token to obtain the tags and text
138found in the parsed document.
139
140The following methods are provided:
141
142=over 4
143
144=item $p = HTML::PullParser->new( file => $file, %options )
145
146=item $p = HTML::PullParser->new( doc => \$doc, %options )
147
148A C<HTML::PullParser> can be made to parse from either a file or a
149literal document based on whether the C<file> or C<doc> option is
150passed to the parser's constructor.
151
152The C<file> passed in can either be a file name or a file handle
153object. If a file name is passed, and it can't be opened for reading,
154then the constructor will return an undefined value and $! will tell
155you why it failed. Otherwise the argument is taken to be some object
156that the C<HTML::PullParser> can read() from when it needs more data.
157The stream will be read() until EOF, but not closed.
158
159A C<doc> can be passed plain or as a reference
160to a scalar. If a reference is passed then the value of this scalar
161should not be changed before all tokens have been extracted.
162
Next, the information to be returned for the different token types must
be set up. This is done by simply associating an argspec (as defined
in L<HTML::Parser>) with the events you have an interest in. For
instance, if you want C<start> tokens to be reported as the string
C<'S'> followed by the tagname and the attributes, you might pass a
C<start> option like this:
169
170 $p = HTML::PullParser->new(
171 doc => $document_to_parse,
172 start => '"S", tagname, @attr',
173 end => '"E", tagname',
174 );
175
Finally, other C<HTML::Parser> options, such as C<ignore_tags> and
C<unbroken_text>, can be passed in. Note that you should not use the
I<event>_h options to set up parser handlers. That would confuse the
inner logic of C<HTML::PullParser>.
180
181=item $token = $p->get_token
182
This method will return the next I<token> found in the HTML document,
or C<undef> at the end of the document. The token is returned as an
array reference. The content of this array matches the argspec set up
during C<HTML::PullParser> construction.
187
188=item $p->unget_token( @tokens )
189
190If you find out you have read too many tokens you can push them back,
191so that they are returned again the next time $p->get_token is called.
192
193=back
194
195=head1 EXAMPLES
196
The 'eg/hform' script shows how we might parse the form section of
HTML documents using HTML::PullParser.
199
200=head1 SEE ALSO
201
202L<HTML::Parser>, L<HTML::TokeParser>
203
204=head1 COPYRIGHT
205
206Copyright 1998-2001 Gisle Aas.
207
208This library is free software; you can redistribute it and/or
209modify it under the same terms as Perl itself.
210
211=cut
Note: See TracBrowser for help on using the repository browser.