1 | package HTML::Parser;
|
---|
2 |
|
---|
3 | # Copyright 1996-2007, Gisle Aas.
|
---|
4 | # Copyright 1999-2000, Michael A. Chase.
|
---|
5 | #
|
---|
6 | # This library is free software; you can redistribute it and/or
|
---|
7 | # modify it under the same terms as Perl itself.
|
---|
8 |
|
---|
9 | use strict;
|
---|
10 | use vars qw($VERSION @ISA);
|
---|
11 |
|
---|
12 | $VERSION = '3.56'; # $Date: 2007-05-17 15:15:41 +1200 (Thu, 17 May 2007) $
|
---|
13 |
|
---|
14 | require HTML::Entities;
|
---|
15 |
|
---|
16 | require XSLoader;
|
---|
17 | XSLoader::load('HTML::Parser', $VERSION);
|
---|
18 |
|
---|
# Constructor.  Blesses an empty hash into the invoking class and passes
# all remaining arguments straight on to init(); whatever init() returns
# is the result of the constructor.
sub new
{
    my $class = shift;
    return bless({}, $class)->init(@_);
}
|
---|
25 |
|
---|
26 |
|
---|
# Initialise a freshly blessed parser object from the constructor
# arguments, which are key/value pairs:
#
#   api_version - defaults to 3 when any argument was given and to 2
#                 otherwise; a value of 4 or more croaks.
#   handlers    - hash ref (or array ref of pairs) mapping an event name
#                 to an array ref of arguments for handler().
#   <event>_h   - array ref of arguments for handler() for that event.
#   text, start, end, process, declaration, comment
#               - rejected with "Bad constructor option".
#   other keys  - treated as a method name and called with the value.
#
# Returns the parser object.
sub init
{
    my $self = shift;
    $self->_alloc_pstate;

    my %arg = @_;

    # An explicit (true) api_version wins; otherwise the mere presence of
    # arguments selects version 3 and an empty argument list version 2.
    my $api_version = delete($arg{api_version}) || (@_ ? 3 : 2);
    if ($api_version >= 4) {
        require Carp;
        Carp::croak(
            "API version $api_version not supported by HTML::Parser $VERSION"
        );
    }

    if ($api_version < 3) {
        # Set up method callbacks compatible with HTML-Parser-2.xx.
        # Each of these events is dispatched to the method of the same name.
        my @v2_methods = (
            [ text    => "self,text,is_cdata" ],
            [ end     => "self,tagname,text" ],
            [ process => "self,token0,text" ],
            [ start   => "self,tagname,attr,attrseq,text" ],
        );
        for my $spec (@v2_methods) {
            my($event, $argspec) = @$spec;
            $self->handler($event => $event, $argspec);
        }

        # comment() is called once for each token of a comment event.
        my $on_comment = sub {
            my($parser, $tokens) = @_;
            foreach my $token (@$tokens) {
                $parser->comment($token);
            }
        };
        $self->handler(comment => $on_comment, "self,tokens");

        # declaration() gets the text with its first two characters and
        # its last character removed.
        my $on_declaration = sub {
            my $parser = shift;
            $parser->declaration(substr($_[0], 2, -1));
        };
        $self->handler(declaration => $on_declaration, "self,text");
    }

    my $handlers = delete $arg{handlers};
    if ($handlers) {
        $handlers = {@$handlers} if ref($handlers) eq "ARRAY";
        while (my($event, $cb) = each %$handlers) {
            $self->handler($event => @$cb);
        }
    }

    # In the end we try to assume plain attribute or handler
    for my $option (keys %arg) {
        my $val = $arg{$option};
        if (my($event) = $option =~ /^(\w+)_h$/) {
            $self->handler($event => @$val);
            next;
        }
        if ($option =~ /^(text|start|end|process|declaration|comment)$/) {
            require Carp;
            Carp::croak("Bad constructor option '$option'");
        }
        $self->$option($val);
    }

    return $self;
}
|
---|
86 |
|
---|
87 |
|
---|
# Parse a document from a file.
#
# $file may be a filename, an open file handle (glob), or a reference to
# an open file handle.  A filename is opened read-only in binary mode and
# closed again before returning; a handle passed in by the caller is read
# but never closed.
#
# The input is fed to parse() in chunks of at most 512 bytes.  Reading
# stops early if parse() returns a false value.  In either case eof() is
# called afterwards and its return value is the result of this method.
#
# Returns undef (with $! set by open) if a filename cannot be opened.
sub parse_file
{
    my($self, $file) = @_;
    my $opened;
    if (!ref($file) && ref(\$file) ne "GLOB") {
        # Assume $file is a filename.  The three-argument form of open
        # takes the name literally, so names such as "cmd |" or ">file"
        # cannot start a process or truncate a file the way they would
        # with two-argument open.
        # "return undef" (not a bare return) keeps the documented
        # behaviour of yielding an undefined value.
        open(my $fh, "<", $file) || return undef;
        binmode($fh);  # should we? good for byte counts
        $opened = 1;
        $file = $fh;
    }
    my $chunk = '';
    while (read($file, $chunk, 512)) {
        # parse() returns false when a handler aborted parsing.
        $self->parse($chunk) || last;
    }
    close($file) if $opened;
    return $self->eof;
}
|
---|
107 |
|
---|
108 |
|
---|
# Legacy accessor, the logical inverse of strict_comment().  Every call
# emits a deprecation warning.  When an argument is given, strict_comment()
# is set to its negation.  The return value is the negation of the
# strict_comment() value that was in effect before the call.
sub netscape_buggy_comment  # legacy
{
    my $self = shift;

    require Carp;
    Carp::carp(
        "netscape_buggy_comment() is deprecated. "
        . "Please use the strict_comment() method instead"
    );

    my $previous = !$self->strict_comment;
    if (@_) {
        my $buggy = shift;
        $self->strict_comment(!$buggy);
    }
    return $previous;
}
|
---|
119 |
|
---|
# Default no-op callbacks.  text() does nothing, and each of the other
# callback method names is installed as an alias of that same subroutine.
sub text { }
for my $stub (qw(start end comment declaration process)) {
    no strict 'refs';
    *{ __PACKAGE__ . "::$stub" } = \&text;
}
|
---|
127 |
|
---|
128 | 1;
|
---|
129 |
|
---|
130 | __END__
|
---|
131 |
|
---|
132 |
|
---|
133 | =head1 NAME
|
---|
134 |
|
---|
135 | HTML::Parser - HTML parser class
|
---|
136 |
|
---|
137 | =head1 SYNOPSIS
|
---|
138 |
|
---|
139 | use HTML::Parser ();
|
---|
140 |
|
---|
141 | # Create parser object
|
---|
142 | $p = HTML::Parser->new( api_version => 3,
|
---|
143 | start_h => [\&start, "tagname, attr"],
|
---|
144 | end_h => [\&end, "tagname"],
|
---|
145 | marked_sections => 1,
|
---|
146 | );
|
---|
147 |
|
---|
148 | # Parse document text chunk by chunk
|
---|
149 | $p->parse($chunk1);
|
---|
150 | $p->parse($chunk2);
|
---|
151 | #...
|
---|
152 | $p->eof; # signal end of document
|
---|
153 |
|
---|
154 | # Parse directly from file
|
---|
155 | $p->parse_file("foo.html");
|
---|
156 | # or
|
---|
157 | open(my $fh, "<:utf8", "foo.html") || die;
|
---|
158 | $p->parse_file($fh);
|
---|
159 |
|
---|
160 | =head1 DESCRIPTION
|
---|
161 |
|
---|
162 | Objects of the C<HTML::Parser> class will recognize markup and
|
---|
163 | separate it from plain text (alias data content) in HTML
|
---|
164 | documents. As different kinds of markup and text are recognized, the
|
---|
165 | corresponding event handlers are invoked.
|
---|
166 |
|
---|
167 | C<HTML::Parser> is not a generic SGML parser. We have tried to
|
---|
168 | make it able to deal with the HTML that is actually "out there", and
|
---|
169 | it normally parses as closely as possible to the way the popular web
|
---|
170 | browsers do it instead of strictly following one of the many HTML
|
---|
171 | specifications from W3C. Where there is disagreement, there is often
|
---|
172 | an option that you can enable to get the official behaviour.
|
---|
173 |
|
---|
174 | The document to be parsed may be supplied in arbitrary chunks. This
|
---|
175 | makes on-the-fly parsing as documents are received from the network
|
---|
176 | possible.
|
---|
177 |
|
---|
178 | If event driven parsing does not feel right for your application, you
|
---|
179 | might want to use C<HTML::PullParser>. This is an C<HTML::Parser>
|
---|
180 | subclass that allows a more conventional program structure.
|
---|
181 |
|
---|
182 |
|
---|
183 | =head1 METHODS
|
---|
184 |
|
---|
185 | The following method is used to construct a new C<HTML::Parser> object:
|
---|
186 |
|
---|
187 | =over
|
---|
188 |
|
---|
189 | =item $p = HTML::Parser->new( %options_and_handlers )
|
---|
190 |
|
---|
191 | This class method creates a new C<HTML::Parser> object and
|
---|
192 | returns it. Key/value argument pairs may be provided to assign event
|
---|
193 | handlers or initialize parser options. The handlers and parser
|
---|
194 | options can also be set or modified later by the method calls described below.
|
---|
195 |
|
---|
196 | If a top level key is in the form "<event>_h" (e.g., "text_h") then it
|
---|
197 | assigns a handler to that event, otherwise it initializes a parser
|
---|
198 | option. The event handler specification value must be an array
|
---|
199 | reference. Multiple handlers may also be assigned with the 'handlers
|
---|
200 | => [%handlers]' option. See examples below.
|
---|
201 |
|
---|
202 | If new() is called without any arguments, it will create a parser that
|
---|
203 | uses callback methods compatible with version 2 of C<HTML::Parser>.
|
---|
204 | See the section on "version 2 compatibility" below for details.
|
---|
205 |
|
---|
206 | The special constructor option 'api_version => 2' can be used to
|
---|
207 | initialize version 2 callbacks while still setting other options and
|
---|
208 | handlers. The 'api_version => 3' option can be used if you don't want
|
---|
209 | to set any options and don't want to fall back to v2 compatible
|
---|
210 | mode.
|
---|
211 |
|
---|
212 | Examples:
|
---|
213 |
|
---|
214 | $p = HTML::Parser->new(api_version => 3,
|
---|
215 | text_h => [ sub {...}, "dtext" ]);
|
---|
216 |
|
---|
217 | This creates a new parser object with a text event handler subroutine
|
---|
218 | that receives the original text with general entities decoded.
|
---|
219 |
|
---|
220 | $p = HTML::Parser->new(api_version => 3,
|
---|
221 | start_h => [ 'my_start', "self,tokens" ]);
|
---|
222 |
|
---|
223 | This creates a new parser object with a start event handler method
|
---|
224 | that receives the $p and the tokens array.
|
---|
225 |
|
---|
226 | $p = HTML::Parser->new(api_version => 3,
|
---|
227 | handlers => { text => [\@array, "event,text"],
|
---|
228 | comment => [\@array, "event,text"],
|
---|
229 | });
|
---|
230 |
|
---|
231 | This creates a new parser object that stores the event type and the
|
---|
232 | original text in @array for text and comment events.
|
---|
233 |
|
---|
234 | =back
|
---|
235 |
|
---|
236 | The following methods feed the HTML document
|
---|
237 | to the C<HTML::Parser> object:
|
---|
238 |
|
---|
239 | =over
|
---|
240 |
|
---|
241 | =item $p->parse( $string )
|
---|
242 |
|
---|
243 | Parse $string as the next chunk of the HTML document. The return
|
---|
244 | value is normally a reference to the parser object (i.e. $p).
|
---|
245 | Handlers invoked should not attempt to modify the $string in-place until
|
---|
246 | $p->parse returns.
|
---|
247 |
|
---|
248 | If an invoked event handler aborts parsing by calling $p->eof, then
|
---|
249 | $p->parse() will return a FALSE value.
|
---|
250 |
|
---|
251 | =item $p->parse( $code_ref )
|
---|
252 |
|
---|
253 | If a code reference is passed as the argument to be parsed, then the
|
---|
254 | chunks to be parsed are obtained by invoking this function repeatedly.
|
---|
255 | Parsing continues until the function returns an empty (or undefined)
|
---|
256 | result. When this happens $p->eof is automatically signaled.
|
---|
257 |
|
---|
258 | Parsing will also abort if one of the event handlers calls $p->eof.
|
---|
259 |
|
---|
260 | The effect of this is the same as:
|
---|
261 |
|
---|
262 | while (1) {
|
---|
263 | my $chunk = &$code_ref();
|
---|
264 | if (!defined($chunk) || !length($chunk)) {
|
---|
265 | $p->eof;
|
---|
266 | return $p;
|
---|
267 | }
|
---|
268 | $p->parse($chunk) || return undef;
|
---|
269 | }
|
---|
270 |
|
---|
271 | But it is more efficient as this loop runs internally in XS code.
|
---|
272 |
|
---|
273 | =item $p->parse_file( $file )
|
---|
274 |
|
---|
275 | Parse text directly from a file. The $file argument can be a
|
---|
276 | filename, an open file handle, or a reference to an open file
|
---|
277 | handle.
|
---|
278 |
|
---|
279 | If $file contains a filename and the file can't be opened, then the
|
---|
280 | method returns an undefined value and $! tells why it failed.
|
---|
281 | Otherwise the return value is a reference to the parser object.
|
---|
282 |
|
---|
283 | If a file handle is passed as the $file argument, then the file will
|
---|
284 | normally be read until EOF, but not closed.
|
---|
285 |
|
---|
286 | If an invoked event handler aborts parsing by calling $p->eof,
|
---|
287 | then $p->parse_file() may not have read the entire file.
|
---|
288 |
|
---|
289 | On systems with multi-byte line terminators, the values passed for the
|
---|
290 | offset and length argspecs may be too low if parse_file() is called on
|
---|
291 | a file handle that is not in binary mode.
|
---|
292 |
|
---|
293 | If a filename is passed in, then parse_file() will open the file in
|
---|
294 | binary mode.
|
---|
295 |
|
---|
296 | =item $p->eof
|
---|
297 |
|
---|
298 | Signals the end of the HTML document. Calling the $p->eof method
|
---|
299 | outside a handler callback will flush any remaining buffered text
|
---|
300 | (which triggers the C<text> event if there is any remaining text).
|
---|
301 |
|
---|
302 | Calling $p->eof inside a handler will terminate parsing at that point
|
---|
303 | and cause $p->parse to return a FALSE value. This also terminates
|
---|
304 | parsing by $p->parse_file().
|
---|
305 |
|
---|
306 | After $p->eof has been called, the parse() and parse_file() methods
|
---|
307 | can be invoked to feed new documents with the parser object.
|
---|
308 |
|
---|
309 | The return value from eof() is a reference to the parser object.
|
---|
310 |
|
---|
311 | =back
|
---|
312 |
|
---|
313 |
|
---|
314 | Most parser options are controlled by boolean attributes.
|
---|
315 | Each boolean attribute is enabled by calling the corresponding method
|
---|
316 | with a TRUE argument and disabled with a FALSE argument. The
|
---|
317 | attribute value is left unchanged if no argument is given. The return
|
---|
318 | value from each method is the old attribute value.
|
---|
319 |
|
---|
320 | Methods that can be used to get and/or set parser options are:
|
---|
321 |
|
---|
322 | =over
|
---|
323 |
|
---|
324 | =item $p->attr_encoded
|
---|
325 |
|
---|
326 | =item $p->attr_encoded( $bool )
|
---|
327 |
|
---|
328 | By default, the C<attr> and C<@attr> argspecs will have general
|
---|
329 | entities for attribute values decoded. Enabling this attribute leaves
|
---|
330 | entities alone.
|
---|
331 |
|
---|
332 | =item $p->boolean_attribute_value( $val )
|
---|
333 |
|
---|
334 | This method sets the value reported for boolean attributes inside HTML
|
---|
335 | start tags. By default, the name of the attribute is also used as its
|
---|
336 | value. This affects the values reported for C<tokens> and C<attr>
|
---|
337 | argspecs.
|
---|
338 |
|
---|
339 | =item $p->case_sensitive
|
---|
340 |
|
---|
341 | =item $p->case_sensitive( $bool )
|
---|
342 |
|
---|
343 | By default, tagnames and attribute names are down-cased. Enabling this
|
---|
344 | attribute leaves them as found in the HTML source document.
|
---|
345 |
|
---|
346 | =item $p->closing_plaintext
|
---|
347 |
|
---|
348 | =item $p->closing_plaintext( $bool )
|
---|
349 |
|
---|
350 | By default, the "plaintext" element can never be closed. Everything up to
|
---|
351 | the end of the document is parsed in CDATA mode. This historical
|
---|
352 | behaviour is what at least MSIE does. Enabling this attribute makes
|
---|
353 | the closing "</plaintext>" tag effective and the parsing process will resume
|
---|
354 | after seeing this tag. This emulates gecko-based browsers.
|
---|
355 |
|
---|
356 | =item $p->empty_element_tags
|
---|
357 |
|
---|
358 | =item $p->empty_element_tags( $bool )
|
---|
359 |
|
---|
360 | By default, empty element tags are not recognized as such and the "/"
|
---|
361 | before ">" is just treated like a normal name character (unless
|
---|
362 | C<strict_names> is enabled). Enabling this attribute makes
|
---|
363 | C<HTML::Parser> recognize these tags.
|
---|
364 |
|
---|
365 | Empty element tags look like start tags, but end with the character
|
---|
366 | sequence "/>" instead of ">". When recognized by C<HTML::Parser> they
|
---|
367 | cause an artificial end event in addition to the start event. The
|
---|
368 | C<text> for the artificial end event will be empty and the C<tokenpos>
|
---|
369 | array will be undefined even though the token array will have one
|
---|
370 | element containing the tag name.
|
---|
371 |
|
---|
372 | =item $p->marked_sections
|
---|
373 |
|
---|
374 | =item $p->marked_sections( $bool )
|
---|
375 |
|
---|
376 | By default, section markings like <![CDATA[...]]> are treated like
|
---|
377 | ordinary text. When this attribute is enabled section markings are
|
---|
378 | honoured.
|
---|
379 |
|
---|
380 | There are currently no events associated with the marked section
|
---|
381 | markup, but the text can be returned as C<skipped_text>.
|
---|
382 |
|
---|
383 | =item $p->strict_comment
|
---|
384 |
|
---|
385 | =item $p->strict_comment( $bool )
|
---|
386 |
|
---|
387 | By default, comments are terminated by the first occurrence of "-->".
|
---|
388 | This is the behaviour of most popular browsers (like Mozilla, Opera and
|
---|
389 | MSIE), but it is not correct according to the official HTML
|
---|
390 | standard. Officially, you need an even number of "--" tokens before
|
---|
391 | the closing ">" is recognized and there may not be anything but
|
---|
392 | whitespace between an even and an odd "--".
|
---|
393 |
|
---|
394 | The official behaviour is enabled by enabling this attribute.
|
---|
395 |
|
---|
396 | Enabling of 'strict_comment' also disables recognizing these forms as
|
---|
397 | comments:
|
---|
398 |
|
---|
399 | </ comment>
|
---|
400 | <! comment>
|
---|
401 |
|
---|
402 |
|
---|
403 | =item $p->strict_end
|
---|
404 |
|
---|
405 | =item $p->strict_end( $bool )
|
---|
406 |
|
---|
407 | By default, attributes and other junk are allowed to be present on end tags in a
|
---|
408 | manner that emulates MSIE's behaviour.
|
---|
409 |
|
---|
410 | The official behaviour is enabled with this attribute. If enabled,
|
---|
411 | only whitespace is allowed between the tagname and the final ">".
|
---|
412 |
|
---|
413 | =item $p->strict_names
|
---|
414 |
|
---|
415 | =item $p->strict_names( $bool )
|
---|
416 |
|
---|
417 | By default, almost anything is allowed in tag and attribute names.
|
---|
418 | This is the behaviour of most popular browsers and allows us to parse
|
---|
419 | some broken tags with invalid attribute values like:
|
---|
420 |
|
---|
421 | <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0>
|
---|
422 |
|
---|
423 | By default, "LIST]" is parsed as a boolean attribute, not as
|
---|
424 | part of the ALT value as was clearly intended. This is also what
|
---|
425 | Mozilla sees.
|
---|
426 |
|
---|
427 | The official behaviour is enabled by enabling this attribute. If
|
---|
428 | enabled, it will cause the tag above to be reported as text
|
---|
429 | since "LIST]" is not a legal attribute name.
|
---|
430 |
|
---|
431 | =item $p->unbroken_text
|
---|
432 |
|
---|
433 | =item $p->unbroken_text( $bool )
|
---|
434 |
|
---|
435 | By default, blocks of text are given to the text handler as soon as
|
---|
436 | possible (but the parser takes care always to break text at a
|
---|
437 | boundary between whitespace and non-whitespace so single words and
|
---|
438 | entities can always be decoded safely). This might create breaks that
|
---|
439 | make it hard to do transformations on the text. When this attribute is
|
---|
440 | enabled, blocks of text are always reported in one piece. This will
|
---|
441 | delay the text event until the following (non-text) event has been
|
---|
442 | recognized by the parser.
|
---|
443 |
|
---|
444 | Note that the C<offset> argspec will give you the offset of the first
|
---|
445 | segment of text and C<length> is the combined length of the segments.
|
---|
446 | Since there might be ignored tags in between, these numbers can't be
|
---|
447 | used to directly index in the original document file.
|
---|
448 |
|
---|
449 | =item $p->utf8_mode
|
---|
450 |
|
---|
451 | =item $p->utf8_mode( $bool )
|
---|
452 |
|
---|
453 | Enable this option when parsing raw undecoded UTF-8. This tells the
|
---|
454 | parser that the entities expanded for strings reported by C<attr>,
|
---|
455 | C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end
|
---|
456 | up compatible with the surrounding text.
|
---|
457 |
|
---|
458 | If C<utf8_mode> is enabled then it is an error to pass strings
|
---|
459 | containing characters with code above 255 to the parse() method, and
|
---|
460 | the parse() method will croak if you try.
|
---|
461 |
|
---|
462 | Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8
|
---|
463 | encoded. The character can also be represented by the entity
|
---|
464 | "&hearts;" or "&#x2665;". If we feed the parser:
|
---|
465 |
|
---|
466 | $p->parse("\xE2\x99\xA5&hearts;");
|
---|
467 |
|
---|
468 | then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without
|
---|
469 | C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled.
|
---|
470 | The latter string is what you want.
|
---|
471 |
|
---|
472 | This option is only available with perl-5.8 or better.
|
---|
473 |
|
---|
474 | =item $p->xml_mode
|
---|
475 |
|
---|
476 | =item $p->xml_mode( $bool )
|
---|
477 |
|
---|
478 | Enabling this attribute changes the parser to allow some XML
|
---|
479 | constructs. This enables the behaviour controlled individually by
|
---|
480 | the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and
|
---|
481 | C<xml_pic> attributes and also suppresses special treatment of
|
---|
482 | elements that are parsed as CDATA for HTML.
|
---|
483 |
|
---|
484 | =item $p->xml_pic
|
---|
485 |
|
---|
486 | =item $p->xml_pic( $bool )
|
---|
487 |
|
---|
488 | By default, I<processing instructions> are terminated by ">". When
|
---|
489 | this attribute is enabled, processing instructions are terminated by
|
---|
490 | "?>" instead.
|
---|
491 |
|
---|
492 | =back
|
---|
493 |
|
---|
494 | As markup and text are recognized, handlers are invoked. The following
|
---|
495 | method is used to set up handlers for different events:
|
---|
496 |
|
---|
497 | =over
|
---|
498 |
|
---|
499 | =item $p->handler( event => \&subroutine, $argspec )
|
---|
500 |
|
---|
501 | =item $p->handler( event => $method_name, $argspec )
|
---|
502 |
|
---|
503 | =item $p->handler( event => \@accum, $argspec )
|
---|
504 |
|
---|
505 | =item $p->handler( event => "" );
|
---|
506 |
|
---|
507 | =item $p->handler( event => undef );
|
---|
508 |
|
---|
509 | =item $p->handler( event );
|
---|
510 |
|
---|
511 | This method assigns a subroutine, method, or array to handle an event.
|
---|
512 |
|
---|
513 | Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>,
|
---|
514 | C<process>, C<start_document>, C<end_document> or C<default>.
|
---|
515 |
|
---|
516 | The C<\&subroutine> is a reference to a subroutine which is called to handle
|
---|
517 | the event.
|
---|
518 |
|
---|
519 | The C<$method_name> is the name of a method of $p which is called to handle
|
---|
520 | the event.
|
---|
521 |
|
---|
522 | The C<@accum> is an array that will hold the event information as
|
---|
523 | sub-arrays.
|
---|
524 |
|
---|
525 | If the second argument is "", the event is ignored.
|
---|
526 | If it is undef, the default handler is invoked for the event.
|
---|
527 |
|
---|
528 | The C<$argspec> is a string that describes the information to be reported
|
---|
529 | for the event. Any requested information that does not apply to a
|
---|
530 | specific event is passed as C<undef>. If argspec is omitted, then it
|
---|
531 | is left unchanged.
|
---|
532 |
|
---|
533 | The return value from $p->handler is the old callback routine or a
|
---|
534 | reference to the accumulator array.
|
---|
535 |
|
---|
536 | Any return values from handler callback routines/methods are always
|
---|
537 | ignored. A handler callback can request parsing to be aborted by
|
---|
538 | invoking the $p->eof method. A handler callback is not allowed to
|
---|
539 | invoke the $p->parse() or $p->parse_file() method. An exception will
|
---|
540 | be raised if it tries.
|
---|
541 |
|
---|
542 | Examples:
|
---|
543 |
|
---|
544 | $p->handler(start => "start", 'self, attr, attrseq, text' );
|
---|
545 |
|
---|
546 | This causes the "start" method of object $p to be called for 'start' events.
|
---|
547 | The callback signature is $p->start(\%attr, \@attr_seq, $text).
|
---|
548 |
|
---|
549 | $p->handler(start => \&start, 'attr, attrseq, text' );
|
---|
550 |
|
---|
551 | This causes subroutine start() to be called for 'start' events.
|
---|
552 | The callback signature is start(\%attr, \@attr_seq, $text).
|
---|
553 |
|
---|
554 | $p->handler(start => \@accum, '"S", attr, attrseq, text' );
|
---|
555 |
|
---|
556 | This causes 'start' event information to be saved in @accum.
|
---|
557 | The array elements will be ['S', \%attr, \@attr_seq, $text].
|
---|
558 |
|
---|
559 | $p->handler(start => "");
|
---|
560 |
|
---|
561 | This causes 'start' events to be ignored. It also suppresses
|
---|
562 | invocations of any default handler for start events. It is in most
|
---|
563 | cases equivalent to $p->handler(start => sub {}), but is more
|
---|
564 | efficient. It is different from the empty-sub-handler in that
|
---|
565 | C<skipped_text> is not reset by it.
|
---|
566 |
|
---|
567 | $p->handler(start => undef);
|
---|
568 |
|
---|
569 | This causes no handler to be associated with start events.
|
---|
570 | If there is a default handler it will be invoked.
|
---|
571 |
|
---|
572 | =back
|
---|
573 |
|
---|
574 | Filters based on tags can be set up to limit the number of events
|
---|
575 | reported. The main bottleneck during parsing is often the huge number
|
---|
576 | of callbacks made from the parser. Applying filters can improve
|
---|
577 | performance significantly.
|
---|
578 |
|
---|
579 | The following methods control filters:
|
---|
580 |
|
---|
581 | =over
|
---|
582 |
|
---|
583 | =item $p->ignore_elements( @tags )
|
---|
584 |
|
---|
585 | Both the C<start> event and the C<end> event as well as any events that
|
---|
586 | would be reported in between are suppressed. The ignored elements can
|
---|
587 | contain nested occurrences of themselves. Example:
|
---|
588 |
|
---|
589 | $p->ignore_elements(qw(script style));
|
---|
590 |
|
---|
591 | The C<script> and C<style> tags will always nest properly since their
|
---|
592 | content is parsed in CDATA mode. For most other tags
|
---|
593 | C<ignore_elements> must be used with caution since HTML is often not
|
---|
594 | I<well formed>.
|
---|
595 |
|
---|
596 | =item $p->ignore_tags( @tags )
|
---|
597 |
|
---|
598 | Any C<start> and C<end> events involving any of the tags given are
|
---|
599 | suppressed. To reset the filter (i.e. don't suppress any C<start> and
|
---|
600 | C<end> events), call C<ignore_tags> without an argument.
|
---|
601 |
|
---|
602 | =item $p->report_tags( @tags )
|
---|
603 |
|
---|
604 | Any C<start> and C<end> events involving any of the tags I<not> given
|
---|
605 | are suppressed. To reset the filter (i.e. report all C<start> and
|
---|
606 | C<end> events), call C<report_tags> without an argument.
|
---|
607 |
|
---|
608 | =back
|
---|
609 |
|
---|
610 | Internally, the system has two filter lists, one for C<report_tags>
|
---|
611 | and one for C<ignore_tags>, and both filters are applied. This
|
---|
612 | effectively gives C<ignore_tags> precedence over C<report_tags>.
|
---|
613 |
|
---|
614 | Examples:
|
---|
615 |
|
---|
616 | $p->ignore_tags(qw(style));
|
---|
617 | $p->report_tags(qw(script style));
|
---|
618 |
|
---|
619 | results in only C<script> events being reported.
|
---|
620 |
|
---|
621 | =head2 Argspec
|
---|
622 |
|
---|
623 | Argspec is a string containing a comma-separated list that describes
|
---|
624 | the information reported by the event. The following argspec
|
---|
625 | identifier names can be used:
|
---|
626 |
|
---|
627 | =over
|
---|
628 |
|
---|
629 | =item C<attr>
|
---|
630 |
|
---|
631 | Attr causes a reference to a hash of attribute name/value pairs to be
|
---|
632 | passed.
|
---|
633 |
|
---|
634 | Boolean attributes' values are either the value set by
|
---|
635 | $p->boolean_attribute_value, or the attribute name if no value has been
|
---|
636 | set by $p->boolean_attribute_value.
|
---|
637 |
|
---|
638 | This passes undef except for C<start> events.
|
---|
639 |
|
---|
640 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
|
---|
641 | names are forced to lower case.
|
---|
642 |
|
---|
643 | General entities are decoded in the attribute values and
|
---|
644 | one layer of matching quotes enclosing the attribute values is removed.
|
---|
645 |
|
---|
646 | The Unicode character set is assumed for entity decoding. With Perl
|
---|
647 | version 5.6 or earlier only the Latin-1 range is supported, and
|
---|
648 | entities for characters outside the range 0..255 are left unchanged.
|
---|
649 |
|
---|
650 | =item C<@attr>
|
---|
651 |
|
---|
652 | Basically the same as C<attr>, but keys and values are passed as
|
---|
653 | individual arguments and the original sequence of the attributes is
|
---|
654 | kept. The parameters passed will be the same as the @attr calculated
|
---|
655 | here:
|
---|
656 |
|
---|
657 | @attr = map { $_ => $attr->{$_} } @$attrseq;
|
---|
658 |
|
---|
659 | assuming $attr and $attrseq here are the hash and array passed as the
|
---|
660 | result of C<attr> and C<attrseq> argspecs.
|
---|
661 |
|
---|
662 | This passes no values for events besides C<start>.
|
---|
663 |
|
---|
664 | =item C<attrseq>
|
---|
665 |
|
---|
666 | Attrseq causes a reference to an array of attribute names to be
|
---|
667 | passed. This can be useful if you want to walk the C<attr> hash in
|
---|
668 | the original sequence.
|
---|
669 |
|
---|
670 | This passes undef except for C<start> events.
|
---|
671 |
|
---|
672 | Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute
|
---|
673 | names are forced to lower case.
|
---|
674 |
|
---|
675 | =item C<column>
|
---|
676 |
|
---|
677 | Column causes the column number of the start of the event to be passed.
|
---|
678 | The first column on a line is 0.
|
---|
679 |
|
---|
680 | =item C<dtext>
|
---|
681 |
|
---|
682 | Dtext causes the decoded text to be passed. General entities are
|
---|
683 | automatically decoded unless the event was inside a CDATA section or
|
---|
684 | was between literal start and end tags (C<script>, C<style>,
|
---|
685 | C<xmp>, and C<plaintext>).
|
---|
686 |
|
---|
687 | The Unicode character set is assumed for entity decoding. With Perl
|
---|
688 | version 5.6 or earlier only the Latin-1 range is supported, and
|
---|
689 | entities for characters outside the range 0..255 are left unchanged.
|
---|
690 |
|
---|
691 | This passes undef except for C<text> events.
|
---|
692 |
|
---|
693 | =item C<event>
|
---|
694 |
|
---|
695 | Event causes the event name to be passed.
|
---|
696 |
|
---|
697 | The event name is one of C<text>, C<start>, C<end>, C<declaration>,
|
---|
698 | C<comment>, C<process>, C<start_document> or C<end_document>.
|
---|
699 |
|
---|
700 | =item C<is_cdata>
|
---|
701 |
|
---|
702 | Is_cdata causes a TRUE value to be passed if the event is inside a CDATA
|
---|
703 | section or between literal start and end tags (C<script>,
|
---|
704 | C<style>, C<xmp>, and C<plaintext>).
|
---|
705 |
|
---|
706 | If the flag is FALSE for a text event, then you should normally
|
---|
707 | either use C<dtext> or decode the entities yourself before the text is
|
---|
708 | processed further.
|
---|
709 |
|
---|
710 | =item C<length>
|
---|
711 |
|
---|
712 | Length causes the number of bytes of the source text of the event to
|
---|
713 | be passed.
|
---|
714 |
|
---|
715 | =item C<line>
|
---|
716 |
|
---|
717 | Line causes the line number of the start of the event to be passed.
|
---|
718 | The first line in the document is 1. Line counting doesn't start
|
---|
719 | until at least one handler requests this value to be reported.
|
---|
720 |
|
---|
721 | =item C<offset>
|
---|
722 |
|
---|
723 | Offset causes the byte position in the HTML document of the start of
|
---|
724 | the event to be passed. The first byte in the document has offset 0.
|
---|
725 |
|
---|
726 | =item C<offset_end>
|
---|
727 |
|
---|
728 | Offset_end causes the byte position in the HTML document of the end of
|
---|
729 | the event to be passed. This is the same as C<offset> + C<length>.
|
---|
730 |
|
---|
731 | =item C<self>
|
---|
732 |
|
---|
733 | Self causes the current object to be passed to the handler. If the
|
---|
734 | handler is a method, this must be the first element in the argspec.
|
---|
735 |
|
---|
736 | An alternative to passing self as an argspec is to register closures
|
---|
737 | that capture $self by themselves as handlers. Unfortunately this
|
---|
738 | creates circular references which prevent the HTML::Parser object
|
---|
739 | from being garbage collected. Using the C<self> argspec avoids this
|
---|
740 | problem.
|
---|
741 |
|
---|
742 | =item C<skipped_text>
|
---|
743 |
|
---|
744 | Skipped_text returns the concatenated text of all the events that have
|
---|
745 | been skipped since the last time an event was reported. Events might
|
---|
746 | be skipped because no handler is registered for them or because some
|
---|
747 | filter applies. Skipped text also includes marked section markup,
|
---|
748 | since there are no events that can catch it.
|
---|
749 |
|
---|
750 | If an C<"">-handler is registered for an event, then the text for this
|
---|
751 | event is not included in C<skipped_text>. Skipped text both before
|
---|
752 | and after the C<"">-event is included in the next reported
|
---|
753 | C<skipped_text>.
|
---|
754 |
|
---|
755 | =item C<tag>
|
---|
756 |
|
---|
757 | Same as C<tagname>, but prefixed with "/" if it belongs to an C<end>
|
---|
758 | event and "!" for a declaration. The C<tag> does not have any prefix
|
---|
759 | for C<start> events, and is in this case identical to C<tagname>.
|
---|
760 |
|
---|
761 | =item C<tagname>
|
---|
762 |
|
---|
763 | This is the element name (or I<generic identifier> in SGML jargon) for
|
---|
764 | start and end tags. Since HTML is case insensitive, this name is
|
---|
765 | forced to lower case to ease string matching.
|
---|
766 |
|
---|
767 | Since XML is case sensitive, the tagname case is not changed when
|
---|
768 | C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute
|
---|
769 | is set.
|
---|
770 |
|
---|
771 | The declaration type of declaration elements is also passed as a tagname,
|
---|
772 | even if that is a bit strange.
|
---|
773 | In fact, in the current implementation tagname is
|
---|
774 | identical to C<token0> except that the name may be forced to lower case.
|
---|
775 |
|
---|
776 | =item C<token0>
|
---|
777 |
|
---|
778 | Token0 causes the original text of the first token string to be
|
---|
779 | passed. This should always be the same as $tokens->[0].
|
---|
780 |
|
---|
781 | For C<declaration> events, this is the declaration type.
|
---|
782 |
|
---|
783 | For C<start> and C<end> events, this is the tag name.
|
---|
784 |
|
---|
785 | For C<process> and non-strict C<comment> events, this is everything
|
---|
786 | inside the tag.
|
---|
787 |
|
---|
788 | This passes undef if there are no tokens in the event.
|
---|
789 |
|
---|
790 | =item C<tokenpos>
|
---|
791 |
|
---|
792 | Tokenpos causes a reference to an array of token positions to be
|
---|
793 | passed. For each string that appears in C<tokens>, this array
|
---|
794 | contains two numbers. The first number is the offset of the start of
|
---|
795 | the token in the original C<text> and the second number is the length
|
---|
796 | of the token.
|
---|
797 |
|
---|
798 | Boolean attributes in a C<start> event will have (0,0) for the
|
---|
799 | attribute value offset and length.
|
---|
800 |
|
---|
801 | This passes undef if there are no tokens in the event (e.g., C<text>)
|
---|
802 | and for artificial C<end> events triggered by empty element tags.
|
---|
803 |
|
---|
804 | If you are using these offsets and lengths to modify C<text>, you
|
---|
805 | should either work from right to left, or be very careful to calculate
|
---|
806 | the changes to the offsets.
|
---|
807 |
|
---|
808 | =item C<tokens>
|
---|
809 |
|
---|
810 | Tokens causes a reference to an array of token strings to be passed.
|
---|
811 | The strings are exactly as they were found in the original text,
|
---|
812 | no decoding or case changes are applied.
|
---|
813 |
|
---|
814 | For C<declaration> events, the array contains each word, comment, and
|
---|
815 | delimited string starting with the declaration type.
|
---|
816 |
|
---|
817 | For C<comment> events, this contains each sub-comment. If
|
---|
818 | $p->strict_comment is disabled, there will be only one sub-comment.
|
---|
819 |
|
---|
820 | For C<start> events, this contains the original tag name followed by
|
---|
821 | the attribute name/value pairs. The values of boolean attributes will
|
---|
822 | be either the value set by $p->boolean_attribute_value, or the
|
---|
823 | attribute name if no value has been set by
|
---|
824 | $p->boolean_attribute_value.
|
---|
825 |
|
---|
826 | For C<end> events, this contains the original tag name (always one token).
|
---|
827 |
|
---|
828 | For C<process> events, this contains the process instructions (always one
|
---|
829 | token).
|
---|
830 |
|
---|
831 | This passes C<undef> for C<text> events.
|
---|
832 |
|
---|
833 | =item C<text>
|
---|
834 |
|
---|
835 | Text causes the source text (including markup element delimiters) to be
|
---|
836 | passed.
|
---|
837 |
|
---|
838 | =item C<undef>
|
---|
839 |
|
---|
840 | Pass an undefined value. Useful as padding where the same handler
|
---|
841 | routine is registered for multiple events.
|
---|
842 |
|
---|
843 | =item C<'...'>
|
---|
844 |
|
---|
845 | A literal string of 0 to 255 characters enclosed
|
---|
846 | in single (') or double (") quotes is passed as entered.
|
---|
847 |
|
---|
848 | =back
|
---|
849 |
|
---|
850 | The whole argspec string can be wrapped up in C<'@{...}'> to signal
|
---|
851 | that the resulting event array should be flattened. This only makes a
|
---|
852 | difference if an array reference is used as the handler target.
|
---|
853 | Consider this example:
|
---|
854 |
|
---|
855 | $p->handler(text => [], 'text');
|
---|
856 | $p->handler(text => [], '@{text}');
|
---|
857 |
|
---|
858 | With two text events, C<"foo"> and C<"bar">, the first example will end
|
---|
859 | up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in
|
---|
860 | the handler target array.
|
---|
861 |
|
---|
862 |
|
---|
863 | =head2 Events
|
---|
864 |
|
---|
865 | Handlers for the following events can be registered:
|
---|
866 |
|
---|
867 | =over
|
---|
868 |
|
---|
869 | =item C<comment>
|
---|
870 |
|
---|
871 | This event is triggered when a markup comment is recognized.
|
---|
872 |
|
---|
873 | Example:
|
---|
874 |
|
---|
875 | <!-- This is a comment -- -- So is this -->
|
---|
876 |
|
---|
877 | =item C<declaration>
|
---|
878 |
|
---|
879 | This event is triggered when a I<markup declaration> is recognized.
|
---|
880 |
|
---|
881 | For typical HTML documents, the only declaration you are
|
---|
882 | likely to find is <!DOCTYPE ...>.
|
---|
883 |
|
---|
884 | Example:
|
---|
885 |
|
---|
886 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
|
---|
887 | "http://www.w3.org/TR/html4/strict.dtd">
|
---|
888 |
|
---|
889 | DTDs inside <!DOCTYPE ...> will confuse HTML::Parser.
|
---|
890 |
|
---|
891 | =item C<default>
|
---|
892 |
|
---|
893 | This event is triggered for events that do not have a specific
|
---|
894 | handler. You can set up a handler for this event to catch stuff you
|
---|
895 | did not want to catch explicitly.
|
---|
896 |
|
---|
897 | =item C<end>
|
---|
898 |
|
---|
899 | This event is triggered when an end tag is recognized.
|
---|
900 |
|
---|
901 | Example:
|
---|
902 |
|
---|
903 | </A>
|
---|
904 |
|
---|
905 | =item C<end_document>
|
---|
906 |
|
---|
907 | This event is triggered when $p->eof is called and after any remaining
|
---|
908 | text is flushed. There is no document text associated with this event.
|
---|
909 |
|
---|
910 | =item C<process>
|
---|
911 |
|
---|
912 | This event is triggered when processing instruction markup is
|
---|
913 | recognized.
|
---|
914 |
|
---|
915 | The format and content of processing instructions are system and
|
---|
916 | application dependent.
|
---|
917 |
|
---|
918 | Examples:
|
---|
919 |
|
---|
920 | <? HTML processing instructions >
|
---|
921 | <? XML processing instructions ?>
|
---|
922 |
|
---|
923 | =item C<start>
|
---|
924 |
|
---|
925 | This event is triggered when a start tag is recognized.
|
---|
926 |
|
---|
927 | Example:
|
---|
928 |
|
---|
929 | <A HREF="http://www.perl.com/">
|
---|
930 |
|
---|
931 | =item C<start_document>
|
---|
932 |
|
---|
933 | This event is triggered before any other events for a new document. A
|
---|
934 | handler for it can be used to initialize stuff. There is no document
|
---|
935 | text associated with this event.
|
---|
936 |
|
---|
937 | =item C<text>
|
---|
938 |
|
---|
939 | This event is triggered when plain text (characters) is recognized.
|
---|
940 | The text may contain multiple lines. A sequence of text may be broken
|
---|
941 | between several text events unless $p->unbroken_text is enabled.
|
---|
942 |
|
---|
943 | The parser will make sure that it does not break a word or a sequence
|
---|
944 | of whitespace between two text events.
|
---|
945 |
|
---|
946 | =back
|
---|
947 |
|
---|
948 | =head2 Unicode
|
---|
949 |
|
---|
950 | The C<HTML::Parser> can parse Unicode strings when running under
|
---|
951 | perl-5.8 or better. If Unicode is passed to $p->parse() then chunks
|
---|
952 | of Unicode will be reported to the handlers. The offset and length
|
---|
953 | argspecs will also report their position in terms of characters.
|
---|
954 |
|
---|
955 | It is safe to parse raw undecoded UTF-8 if you either avoid decoding
|
---|
956 | entities and make sure to not use I<argspecs> that do, or enable the
|
---|
957 | C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be
|
---|
958 | useful when parsing from a file where you need the reported offsets
|
---|
959 | and lengths to match the byte offsets in the file.
|
---|
960 |
|
---|
961 | If a filename is passed to $p->parse_file() then the file will be read
|
---|
962 | in binary mode. This will be fine if the file contains only ASCII or
|
---|
963 | Latin-1 characters. If the file contains UTF-8 encoded text then care
|
---|
964 | must be taken when decoding entities as described in the previous
|
---|
965 | paragraph, but better is to open the file with the UTF-8 layer so that
|
---|
966 | it is decoded properly:
|
---|
967 |
|
---|
968 | open(my $fh, "<:utf8", "index.html") || die "...: $!";
|
---|
969 | $p->parse_file($fh);
|
---|
970 |
|
---|
971 | If the file contains text encoded in a charset besides ASCII, Latin-1
|
---|
972 | or UTF-8 then decoding will always be needed.
|
---|
973 |
|
---|
974 | =head1 VERSION 2 COMPATIBILITY
|
---|
975 |
|
---|
976 | When an C<HTML::Parser> object is constructed with no arguments, a set
|
---|
977 | of handlers is automatically provided that is compatible with the old
|
---|
978 | HTML::Parser version 2 callback methods.
|
---|
979 |
|
---|
980 | This is equivalent to the following method calls:
|
---|
981 |
|
---|
982 | $p->handler(start => "start", "self, tagname, attr, attrseq, text");
|
---|
983 | $p->handler(end => "end", "self, tagname, text");
|
---|
984 | $p->handler(text => "text", "self, text, is_cdata");
|
---|
985 | $p->handler(process => "process", "self, token0, text");
|
---|
986 | $p->handler(comment =>
|
---|
987 | sub {
|
---|
988 | my($self, $tokens) = @_;
|
---|
989 | for (@$tokens) {$self->comment($_);}},
|
---|
990 | "self, tokens");
|
---|
991 | $p->handler(declaration =>
|
---|
992 | sub {
|
---|
993 | my $self = shift;
|
---|
994 | $self->declaration(substr($_[0], 2, -1));},
|
---|
995 | "self, text");
|
---|
996 |
|
---|
997 | Setting up these handlers can also be requested with the "api_version =>
|
---|
998 | 2" constructor option.
|
---|
999 |
|
---|
1000 | =head1 SUBCLASSING
|
---|
1001 |
|
---|
1002 | The C<HTML::Parser> class is subclassable. Parser objects are plain
|
---|
1003 | hashes and C<HTML::Parser> reserves only hash keys that start with
|
---|
1004 | "_hparser". The parser state can be set up by invoking the init()
|
---|
1005 | method, which takes the same arguments as new().
|
---|
1006 |
|
---|
1007 | =head1 EXAMPLES
|
---|
1008 |
|
---|
1009 | The first simple example shows how you might strip out comments from
|
---|
1010 | an HTML document. We achieve this by setting up a comment handler that
|
---|
1011 | does nothing and a default handler that will print out anything else:
|
---|
1012 |
|
---|
1013 | use HTML::Parser;
|
---|
1014 | HTML::Parser->new(default_h => [sub { print shift }, 'text'],
|
---|
1015 | comment_h => [""],
|
---|
1016 | )->parse_file(shift || die) || die $!;
|
---|
1017 |
|
---|
1018 | An alternative implementation is:
|
---|
1019 |
|
---|
1020 | use HTML::Parser;
|
---|
1021 | HTML::Parser->new(end_document_h => [sub { print shift },
|
---|
1022 | 'skipped_text'],
|
---|
1023 | comment_h => [""],
|
---|
1024 | )->parse_file(shift || die) || die $!;
|
---|
1025 |
|
---|
1026 | This will in most cases be much more efficient since only a single
|
---|
1027 | callback will be made.
|
---|
1028 |
|
---|
1029 | The next example prints out the text that is inside the <title>
|
---|
1030 | element of an HTML document. Here we start by setting up a start
|
---|
1031 | handler. When it sees the title start tag it enables a text handler
|
---|
1032 | that prints any text found and an end handler that will terminate
|
---|
1033 | parsing as soon as the title end tag is seen:
|
---|
1034 |
|
---|
1035 | use HTML::Parser ();
|
---|
1036 |
|
---|
1037 | sub start_handler
|
---|
1038 | {
|
---|
1039 | return if shift ne "title";
|
---|
1040 | my $self = shift;
|
---|
1041 | $self->handler(text => sub { print shift }, "dtext");
|
---|
1042 | $self->handler(end => sub { shift->eof if shift eq "title"; },
|
---|
1043 | "tagname,self");
|
---|
1044 | }
|
---|
1045 |
|
---|
1046 | my $p = HTML::Parser->new(api_version => 3);
|
---|
1047 | $p->handler( start => \&start_handler, "tagname,self");
|
---|
1048 | $p->parse_file(shift || die) || die $!;
|
---|
1049 | print "\n";
|
---|
1050 |
|
---|
1051 | More examples are found in the F<eg/> directory of the C<HTML-Parser>
|
---|
1052 | distribution: the program C<hrefsub> shows how you can edit all links
|
---|
1053 | found in a document; the program C<htextsub> shows how to edit the text only; the
|
---|
1054 | program C<hstrip> shows how you can strip out certain tags/elements
|
---|
1055 | and/or attributes; and the program C<htext> shows how to obtain the
|
---|
1056 | plain text, but not any script/style content.
|
---|
1057 |
|
---|
1058 | You can browse the F<eg/> directory online from the I<[Browse]> link on
|
---|
1059 | the http://search.cpan.org/~gaas/HTML-Parser/ page.
|
---|
1060 |
|
---|
1061 | =head1 BUGS
|
---|
1062 |
|
---|
1063 | The <style> and <script> sections do not end with the first "</", but
|
---|
1064 | need the complete corresponding end tag. The standard behaviour is
|
---|
1065 | not really practical.
|
---|
1066 |
|
---|
1067 | When the I<strict_comment> option is enabled, we still recognize
|
---|
1068 | comments where there is something other than whitespace between even
|
---|
1069 | and odd "--" markers.
|
---|
1070 |
|
---|
1071 | Once $p->boolean_attribute_value has been set, there is no way to
|
---|
1072 | restore the default behaviour.
|
---|
1073 |
|
---|
1074 | There is currently no way to get both quote characters
|
---|
1075 | into the same literal argspec.
|
---|
1076 |
|
---|
1077 | Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them
|
---|
1078 | to repeat the previous start tag or close the previous start tag
|
---|
1079 | respectively.
|
---|
1080 |
|
---|
1081 | NET tags, e.g. "code/.../" are not recognized. This is SGML
|
---|
1082 | shorthand for "<code>...</code>".
|
---|
1083 |
|
---|
1084 | Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not
|
---|
1085 | recognized.
|
---|
1086 |
|
---|
1087 | =head1 DIAGNOSTICS
|
---|
1088 |
|
---|
1089 | The following messages may be produced by HTML::Parser. The notation
|
---|
1090 | in this listing is the same as used in L<perldiag>:
|
---|
1091 |
|
---|
1092 | =over
|
---|
1093 |
|
---|
1094 | =item Not a reference to a hash
|
---|
1095 |
|
---|
1096 | (F) The object blessed into or subclassed from HTML::Parser is not a
|
---|
1097 | hash as required by the HTML::Parser methods.
|
---|
1098 |
|
---|
1099 | =item Bad signature in parser state object at %p
|
---|
1100 |
|
---|
1101 | (F) The _hparser_xs_state element does not refer to a valid state structure.
|
---|
1102 | Something must have changed the internal value
|
---|
1103 | stored in this hash element, or the memory has been overwritten.
|
---|
1104 |
|
---|
1105 | =item _hparser_xs_state element is not a reference
|
---|
1106 |
|
---|
1107 | (F) The _hparser_xs_state element has been destroyed.
|
---|
1108 |
|
---|
1109 | =item Can't find '_hparser_xs_state' element in HTML::Parser hash
|
---|
1110 |
|
---|
1111 | (F) The _hparser_xs_state element is missing from the parser hash.
|
---|
1112 | It was either deleted, or not created when the object was created.
|
---|
1113 |
|
---|
1114 | =item API version %s not supported by HTML::Parser %s
|
---|
1115 |
|
---|
1116 | (F) The constructor option 'api_version' with an argument greater than
|
---|
1117 | or equal to 4 is reserved for future extensions.
|
---|
1118 |
|
---|
1119 | =item Bad constructor option '%s'
|
---|
1120 |
|
---|
1121 | (F) An unknown constructor option key was passed to the new() or
|
---|
1122 | init() methods.
|
---|
1123 |
|
---|
1124 | =item Parse loop not allowed
|
---|
1125 |
|
---|
1126 | (F) A handler invoked the parse() or parse_file() method.
|
---|
1127 | This is not permitted.
|
---|
1128 |
|
---|
1129 | =item marked sections not supported
|
---|
1130 |
|
---|
1131 | (F) The $p->marked_sections() method was invoked in an HTML::Parser
|
---|
1132 | module that was compiled without support for marked sections.
|
---|
1133 |
|
---|
1134 | =item Unknown boolean attribute (%d)
|
---|
1135 |
|
---|
1136 | (F) Something is wrong with the internal logic that set up aliases for
|
---|
1137 | boolean attributes.
|
---|
1138 |
|
---|
1139 | =item Only code or array references allowed as handler
|
---|
1140 |
|
---|
1141 | (F) The second argument for $p->handler must be either a subroutine
|
---|
1142 | reference, the name of a subroutine or method, or a reference to an
|
---|
1143 | array.
|
---|
1144 |
|
---|
1145 | =item No handler for %s events
|
---|
1146 |
|
---|
1147 | (F) The first argument to $p->handler must be a valid event name; i.e. one
|
---|
1148 | of "start", "end", "text", "process", "declaration", "comment", "default", "start_document" or "end_document".
|
---|
1149 |
|
---|
1150 | =item Unrecognized identifier %s in argspec
|
---|
1151 |
|
---|
1152 | (F) The identifier is not a known argspec name.
|
---|
1153 | Use one of the names mentioned in the argspec section above.
|
---|
1154 |
|
---|
1155 | =item Literal string is longer than 255 chars in argspec
|
---|
1156 |
|
---|
1157 | (F) The current implementation limits the length of literals in
|
---|
1158 | an argspec to 255 characters. Make the literal shorter.
|
---|
1159 |
|
---|
1160 | =item Backslash reserved for literal string in argspec
|
---|
1161 |
|
---|
1162 | (F) The backslash character "\" is not allowed in argspec literals.
|
---|
1163 | It is reserved to permit quoting inside a literal in a later version.
|
---|
1164 |
|
---|
1165 | =item Unterminated literal string in argspec
|
---|
1166 |
|
---|
1167 | (F) The terminating quote character for a literal was not found.
|
---|
1168 |
|
---|
1169 | =item Bad argspec (%s)
|
---|
1170 |
|
---|
1171 | (F) Only identifier names, literals, spaces and commas
|
---|
1172 | are allowed in argspecs.
|
---|
1173 |
|
---|
1174 | =item Missing comma separator in argspec
|
---|
1175 |
|
---|
1176 | (F) Identifiers in an argspec must be separated with ",".
|
---|
1177 |
|
---|
1178 | =item Parsing of undecoded UTF-8 will give garbage when decoding entities
|
---|
1179 |
|
---|
1180 | (W) The first chunk parsed appears to contain undecoded UTF-8 and one
|
---|
1181 | or more argspecs that decode entities are used for the callback
|
---|
1182 | handlers.
|
---|
1183 |
|
---|
1184 | The result of decoding will be a mix of encoded and decoded characters
|
---|
1185 | for any entities that expand to characters with code above 127. This
|
---|
1186 | is not a good thing.
|
---|
1187 |
|
---|
1188 | The solution is to use Encode::decode_utf8() on the data before
|
---|
1189 | feeding it to the $p->parse(). For $p->parse_file() pass a file that
|
---|
1190 | has been opened in ":utf8" mode.
|
---|
1191 |
|
---|
1192 | The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode>
|
---|
1193 | is enabled or if the "attr", "@attr" or "dtext" argspecs are avoided.
|
---|
1194 |
|
---|
1195 | =item Parsing string decoded with wrong endianess
|
---|
1196 |
|
---|
1197 | (W) The first character in the document is U+FFFE. This is not a
|
---|
1198 | legal Unicode character but a byte swapped BOM. The result of parsing
|
---|
1199 | will likely be garbage.
|
---|
1200 |
|
---|
1201 | =item Parsing of undecoded UTF-32
|
---|
1202 |
|
---|
1203 | (W) The parser found the Unicode UTF-32 BOM signature at the start
|
---|
1204 | of the document. The result of parsing will likely be garbage.
|
---|
1205 |
|
---|
1206 | =item Parsing of undecoded UTF-16
|
---|
1207 |
|
---|
1208 | (W) The parser found the Unicode UTF-16 BOM signature at the start of
|
---|
1209 | the document. The result of parsing will likely be garbage.
|
---|
1210 |
|
---|
1211 | =back
|
---|
1212 |
|
---|
1213 | =head1 SEE ALSO
|
---|
1214 |
|
---|
1215 | L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>,
|
---|
1216 | L<HTML::LinkExtor>, L<HTML::Form>
|
---|
1217 |
|
---|
1218 | L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution)
|
---|
1219 |
|
---|
1220 | http://www.w3.org/TR/html4
|
---|
1221 |
|
---|
1222 | More information about marked sections and processing instructions may
|
---|
1223 | be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>.
|
---|
1224 |
|
---|
1225 | =head1 COPYRIGHT
|
---|
1226 |
|
---|
1227 | Copyright 1996-2007 Gisle Aas. All rights reserved.
|
---|
1228 | Copyright 1999-2000 Michael A. Chase. All rights reserved.
|
---|
1229 |
|
---|
1230 | This library is free software; you can redistribute it and/or
|
---|
1231 | modify it under the same terms as Perl itself.
|
---|
1232 |
|
---|
1233 | =cut
|
---|