Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/cpan/HTML/PullParser.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago
Perl modules required for HTMLTidy
Property svn:keywords set to `Author Date Id Revision`
File size: 5.6 KB

Line
package HTML::PullParser;

# $Id: PullParser.pm 14078 2007-05-17 03:15:41Z lh92 $

require HTML::Parser;
@ISA=qw(HTML::Parser);

# Derive $VERSION from the expanded Revision keyword.  The keyword may
# expand to "major.minor" or to a single integer revision number with no
# dot at all.  The minor part is therefore optional: requiring the dot
# would make the match fail on an integer revision and leave $VERSION
# at "0.00".
{
    my($major, $minor) = q$Revision: 14078 $ =~ /(\d+)(?:\.(\d+))?/;
    $VERSION = sprintf("%d.%02d", $major || 0, $minor || 0);
}

use strict;
use Carp ();
11
# Constructor: HTML::PullParser->new(file => $file, %options)
#          or: HTML::PullParser->new(doc  => $doc,  %options)
#
# Keys consumed from the option list:
#   start, end, text, declaration, comment, process, default
#          argspec for each event whose tokens should be collected;
#          at least one of them must be given.
#   file   a file name, or something that can be read() from.
#   doc    the document itself, plain or as a reference to a scalar.
# Every other key is handed on to the superclass constructor, with
# api_version forced to 3.
#
# Returns the new parser object.  Returns nothing when 'file' is a file
# name that IO::File cannot open for reading.
# Croaks when no event argspec is given, when both 'doc' and 'file' are
# given, or when neither is given.
sub new
{
    my($class, %cnf) = @_;

    # Construct argspecs for the various events
    my %argspec;
    for (qw(start end text declaration comment process default)) {
        my $tmp = delete $cnf{$_};
        next unless defined $tmp;
        $argspec{$_} = $tmp;
    }
    Carp::croak("Info not collected for any events")
        unless %argspec;

    my $file = delete $cnf{file};
    my $doc = delete $cnf{doc};
    Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
        if defined($file) && defined($doc);
    Carp::croak("No 'doc' or 'file' given to parse from")
        unless defined($file) || defined($doc);

    # Create object
    $cnf{api_version} = 3;
    my $self = $class->SUPER::new(%cnf);

    # Every requested event appends its token to one shared queue;
    # get_token() later shifts tokens off the front of that queue.
    my $accum = $self->{pullparser_accum} = [];
    while (my($event, $argspec) = each %argspec) {
        $self->SUPER::handler($event => $accum, $argspec);
    }

    if (defined $doc) {
        # Always keep a reference so get_token() can substr() through it.
        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
        $self->{pullparser_str_pos} = 0;
    }
    else {
        # A plain string (neither a reference nor a glob) is a file name.
        if (!ref($file) && ref(\$file) ne "GLOB") {
            require IO::File;
            $file = IO::File->new($file, "r") || return;
        }

        $self->{pullparser_file} = $file;
    }
    $self;
}
56
57
# Handlers are installed once by new() and must not be changed
# afterwards, so this method rejects every call.
sub handler
{
    my $complaint = "Can't set handlers for HTML::PullParser";
    Carp::croak($complaint);
}
62
63
# Return the next queued token, or undef once the input is exhausted and
# the queue is empty.  While the queue is empty and end of input has not
# been flagged, up to 512 more bytes/characters are fed to the parser
# from the file handle or the document string.
sub get_token
{
    my $parser = shift;

    until (@{ $parser->{pullparser_accum} } || $parser->{pullparser_eof}) {

        # Source 1: a file handle -- pull the next block from it.
        if (my $fh = $parser->{pullparser_file}) {
            my $data;
            unless (read($fh, $data, 512)) {
                # Nothing more to read: flush the parser and flag the end.
                $parser->eof;
                $parser->{pullparser_eof}++;
                delete $parser->{pullparser_file};
                next;
            }
            $parser->parse($data);
            next;
        }

        # Source 2: an in-memory document.  Having neither source while
        # end of input is still unflagged is an internal inconsistency.
        my $doc_ref = $parser->{pullparser_str_ref} or die;

        my $offset = $parser->{pullparser_str_pos};
        my $piece  = substr($$doc_ref, $offset, 512);
        $parser->parse($piece);

        my $resume_at = $offset + length($piece);
        if ($resume_at >= length($$doc_ref)) {
            # The whole string has been consumed.
            $parser->eof;
            $parser->{pullparser_eof}++;
            delete $parser->{pullparser_str_ref};
            delete $parser->{pullparser_str_pos};
            next;
        }
        $parser->{pullparser_str_pos} = $resume_at;
    }

    return shift @{ $parser->{pullparser_accum} };
}
101
102
# Push the given tokens back onto the front of the queue, in the order
# given, so that get_token() hands them out again.  Returns the parser
# object itself.
sub unget_token
{
    my($self, @tokens) = @_;
    unshift @{ $self->{pullparser_accum} }, @tokens;
    return $self;
}

1;
111
112
113	__END__
114
115	=head1 NAME
116
117	HTML::PullParser - Alternative HTML::Parser interface
118
119	=head1 SYNOPSIS
120
121	use HTML::PullParser;
122
123	$p = HTML::PullParser->new(file => "index.html",
124	start => 'event, tagname, @attr',
125	end => 'event, tagname',
126	ignore_elements => [qw(script style)],
127	) || die "Can't open: $!";
128	while (my $token = $p->get_token) {
129	#...do something with $token
130	}
131
132	=head1 DESCRIPTION
133
134	The HTML::PullParser is an alternative interface to the HTML::Parser class.
135	It basically turns the HTML::Parser inside out. You associate a file
136	(or any IO::Handle object or string) with the parser at construction time and
137	then repeatedly call $parser->get_token to obtain the tags and text
138	found in the parsed document.
139
140	The following methods are provided:
141
142	=over 4
143
144	=item $p = HTML::PullParser->new( file => $file, %options )
145
146	=item $p = HTML::PullParser->new( doc => \$doc, %options )
147
148	A C<HTML::PullParser> can be made to parse from either a file or a
149	literal document based on whether the C<file> or C<doc> option is
150	passed to the parser's constructor.
151
152	The C<file> passed in can either be a file name or a file handle
153	object. If a file name is passed, and it can't be opened for reading,
154	then the constructor will return an undefined value and $! will tell
155	you why it failed. Otherwise the argument is taken to be some object
156	that the C<HTML::PullParser> can read() from when it needs more data.
157	The stream will be read() until EOF, but not closed.
158
159	A C<doc> can be passed plain or as a reference
160	to a scalar. If a reference is passed then the value of this scalar
161	should not be changed before all tokens have been extracted.
162
163	Next the information to be returned for the different token types must
164	be set up. This is done by simply associating an argspec (as defined
165	in L<HTML::Parser>) with the events you have an interest in. For
166	instance, if you want C<start> tokens to be reported as the string
167	C<'S'> followed by the tagname and the attributes you might pass a
168	C<start> option like this:
169
170	$p = HTML::PullParser->new(
171	doc => $document_to_parse,
172	start => '"S", tagname, @attr',
173	end => '"E", tagname',
174	);
175
176	Finally, other C<HTML::Parser> options, such as C<ignore_tags> and
177	C<unbroken_text>, can be passed in. Note that you should not use the
178	I<event>_h options to set up parser handlers. That would confuse the
179	inner logic of C<HTML::PullParser>.
180
181	=item $token = $p->get_token
182
183	This method will return the next I<token> found in the HTML document,
184	or C<undef> at the end of the document. The token is returned as an
185	array reference. The contents of this array match the argspec set up
186	during C<HTML::PullParser> construction.
187
188	=item $p->unget_token( @tokens )
189
190	If you find out you have read too many tokens you can push them back,
191	so that they are returned again the next time $p->get_token is called.
192
193	=back
194
195	=head1 EXAMPLES
196
197	The 'eg/hform' script shows how we might parse the form section of
198	HTML documents using HTML::PullParser.
199
200	=head1 SEE ALSO
201
202	L<HTML::Parser>, L<HTML::TokeParser>
203
204	=head1 COPYRIGHT
205
206	Copyright 1998-2001 Gisle Aas.
207
208	This library is free software; you can redistribute it and/or
209	modify it under the same terms as Perl itself.
210
211	=cut

Note: See TracBrowser for help on using the repository browser.

Download in other formats: