1 | package HTML::Tagset;
|
---|
2 |
|
---|
3 | use strict;
|
---|
4 |
|
---|
5 | =head1 NAME
|
---|
6 |
|
---|
7 | HTML::Tagset - data tables useful in parsing HTML
|
---|
8 |
|
---|
9 | =head1 VERSION
|
---|
10 |
|
---|
11 | Version 3.10
|
---|
12 |
|
---|
13 | =cut
|
---|
14 |
|
---|
# Package version number; it mirrors the "Version" line in the POD above.
use vars '$VERSION';

$VERSION = '3.10';
|
---|
18 |
|
---|
19 | =head1 SYNOPSIS
|
---|
20 |
|
---|
21 | use HTML::Tagset;
|
---|
22 | # Then use any of the items in the HTML::Tagset package
|
---|
23 | # as need arises
|
---|
24 |
|
---|
25 | =head1 DESCRIPTION
|
---|
26 |
|
---|
27 | This module contains several data tables useful in various kinds of
|
---|
28 | HTML parsing operations.
|
---|
29 |
|
---|
30 | Note that all tag names used are lowercase.
|
---|
31 |
|
---|
32 | In the following documentation, a "hashset" is a hash being used as a
|
---|
33 | set -- the hash conveys that its keys are there, and the actual values
|
---|
34 | associated with the keys are not significant. (But what values are
|
---|
35 | there, are always true.)
|
---|
36 |
|
---|
37 | =cut
|
---|
38 |
|
---|
39 | use vars qw(
|
---|
40 | $VERSION
|
---|
41 | %emptyElement %optionalEndTag %linkElements %boolean_attr
|
---|
42 | %isHeadElement %isBodyElement %isPhraseMarkup
|
---|
43 | %is_Possible_Strict_P_Content
|
---|
44 | %isHeadOrBodyElement
|
---|
45 | %isList %isTableElement %isFormElement
|
---|
46 | %isKnown %canTighten
|
---|
47 | @p_closure_barriers
|
---|
48 | %isCDATA_Parent
|
---|
49 | );
|
---|
50 |
|
---|
51 | =head1 VARIABLES
|
---|
52 |
|
---|
53 | Note that none of these variables are exported.
|
---|
54 |
|
---|
55 | =head2 hashset %HTML::Tagset::emptyElement
|
---|
56 |
|
---|
This hashset has as keys the tag-names (GIs) of elements that cannot
have content. (For example, "base", "br", "hr".) So
C<$HTML::Tagset::emptyElement{'hr'}> exists and is true.
C<$HTML::Tagset::emptyElement{'dl'}> does not exist, and so is not true.
|
---|
61 |
|
---|
62 | =cut
|
---|
63 |
|
---|
# Hashset of the elements that cannot have content.  Every key maps to 1.
%emptyElement = map { ( $_ => 1 ) } (
    qw( base link meta isindex ),
    qw( img br hr wbr ),
    qw( input area param ),
    qw( embed bgsound spacer ),
    qw( basefont col frame ),

    # "~"-initial names are pseudo-elements rather than real HTML tags.
    qw( ~comment ~literal ~declaration ~pi ),
);
|
---|
# The "~"-initial names are for pseudo-elements used by HTML::Element
# and HTML::TreeBuilder
|
---|
74 |
|
---|
75 | =head2 hashset %HTML::Tagset::optionalEndTag
|
---|
76 |
|
---|
This hashset lists tag-names for elements that can have content, but whose
end-tags are generally, "safely", omissible. Example:
C<$HTML::Tagset::optionalEndTag{'li'}> exists and is true.
|
---|
80 |
|
---|
81 | =cut
|
---|
82 |
|
---|
# Hashset of elements whose end-tags may be omitted.  Only these four are
# listed; option, th, tr and td were present in the original list but
# commented out, so they are not included here either.
%optionalEndTag = map +( $_ => 1 ), qw( p li dt dd );
|
---|
84 |
|
---|
85 | =head2 hash %HTML::Tagset::linkElements
|
---|
86 |
|
---|
Keys in this hash are tagnames for elements that might contain
links, and the value for each is a reference to an array of the names
of attributes whose values can be links.
|
---|
90 |
|
---|
91 | =cut
|
---|
92 |
|
---|
# Maps an element name to a reference to an array holding the names of the
# attributes of that element whose values can be links.  Each element gets
# its own array, in the attribute order given here.
%linkElements = (
    'a'          => [ qw( href ) ],
    'applet'     => [ qw( archive codebase code ) ],
    'area'       => [ qw( href ) ],
    'base'       => [ qw( href ) ],
    'bgsound'    => [ qw( src ) ],
    'blockquote' => [ qw( cite ) ],
    'body'       => [ qw( background ) ],
    'del'        => [ qw( cite ) ],
    'embed'      => [ qw( pluginspage src ) ],
    'form'       => [ qw( action ) ],
    'frame'      => [ qw( src longdesc ) ],
    'iframe'     => [ qw( src longdesc ) ],
    'ilayer'     => [ qw( background ) ],
    'img'        => [ qw( src lowsrc longdesc usemap ) ],
    'input'      => [ qw( src usemap ) ],
    'ins'        => [ qw( cite ) ],
    'isindex'    => [ qw( action ) ],
    'head'       => [ qw( profile ) ],
    'layer'      => [ qw( background src ) ],
    'link'       => [ qw( href ) ],
    'object'     => [ qw( classid codebase data archive usemap ) ],
    'q'          => [ qw( cite ) ],
    'script'     => [ qw( src for ) ],
    'table'      => [ qw( background ) ],
    'td'         => [ qw( background ) ],
    'th'         => [ qw( background ) ],
    'tr'         => [ qw( background ) ],
    'xmp'        => [ qw( href ) ],
);
|
---|
124 |
|
---|
125 | =head2 hash %HTML::Tagset::boolean_attr
|
---|
126 |
|
---|
127 | This hash (not hashset) lists what attributes of what elements can be
|
---|
128 | printed without showing the value (for example, the "noshade" attribute
|
---|
129 | of "hr" elements). For elements with only one such attribute, its value
|
---|
130 | is simply that attribute name. For elements with many such attributes,
|
---|
131 | the value is a reference to a hashset containing all such attributes.
|
---|
132 |
|
---|
133 | =cut
|
---|
134 |
|
---|
# Maps an element name to the attribute(s) of that element that can be
# printed without a value.  An element with a single such attribute maps to
# the attribute name as a plain string; an element with several maps to a
# reference to a hashset of the attribute names.
# TODO: make these all hashes
%boolean_attr = (
    'area'   => 'nohref',
    'hr'     => 'noshade',
    'img'    => 'ismap',
    'input'  => { 'checked' => 1, 'readonly' => 1, 'disabled' => 1 },
    'option' => 'selected',
    'select' => 'multiple',

    # The list-like elements all share the "compact" attribute ...
    ( map { ( $_ => 'compact' ) } qw( dir dl menu ol ul ) ),

    # ... and both kinds of table cell share "nowrap".
    ( map { ( $_ => 'nowrap' ) } qw( td th ) ),
);
|
---|
151 |
|
---|
152 | #==========================================================================
|
---|
153 | # List of all elements from Extensible HTML version 1.0 Transitional DTD:
|
---|
154 | #
|
---|
155 | # a abbr acronym address applet area b base basefont bdo big
|
---|
156 | # blockquote body br button caption center cite code col colgroup
|
---|
157 | # dd del dfn dir div dl dt em fieldset font form h1 h2 h3 h4 h5 h6
|
---|
158 | # head hr html i iframe img input ins isindex kbd label legend li
|
---|
159 | # link map menu meta noframes noscript object ol optgroup option p
|
---|
160 | # param pre q s samp script select small span strike strong style
|
---|
161 | # sub sup table tbody td textarea tfoot th thead title tr tt u ul
|
---|
162 | # var
|
---|
163 | #
|
---|
164 | # Varia from Mozilla source internal table of tags:
|
---|
165 | # Implemented:
|
---|
166 | # xmp listing wbr nobr frame frameset noframes ilayer
|
---|
167 | # layer nolayer spacer embed multicol
|
---|
168 | # But these are unimplemented:
|
---|
169 | # sound?? keygen?? server??
|
---|
170 | # Also seen here and there:
|
---|
171 | # marquee?? app?? (both unimplemented)
|
---|
172 | #==========================================================================
|
---|
173 |
|
---|
174 | =head2 hashset %HTML::Tagset::isPhraseMarkup
|
---|
175 |
|
---|
176 | This hashset contains all phrasal-level elements.
|
---|
177 |
|
---|
178 | =cut
|
---|
179 |
|
---|
# Hashset of the phrasal-level elements.
# (center, hr and table used to be listed here as well.)
%isPhraseMarkup = map +( $_ => 1 ), (
    qw( span abbr acronym q sub sup ),
    qw( cite code em kbd samp strong var dfn strike ),
    qw( b i u s tt small big ),
    qw( a img br ),
    qw( wbr nobr blink ),
    qw( font basefont bdo ),
    qw( spacer embed noembed ),
);
|
---|
189 |
|
---|
190 |
|
---|
191 | =head2 hashset %HTML::Tagset::is_Possible_Strict_P_Content
|
---|
192 |
|
---|
This hashset contains all phrasal-level elements that can be content of a
P element, for a strict model of HTML.
|
---|
195 |
|
---|
196 | =cut
|
---|
197 |
|
---|
# Hashset of the elements that may appear as content of a P element under a
# strict model of HTML: everything phrasal, the form elements, and the
# extra elements object, script and map.
#
# The form-element names are written out here rather than interpolated from
# %isFormElement.  That hash is assigned further down in this file, so at
# this point it is still empty and interpolating it contributes no keys at
# all -- which silently left the form elements out of this set.  Keep the
# list below in sync with %isFormElement.
%is_Possible_Strict_P_Content = (
    %isPhraseMarkup,

    # Same names as %isFormElement.
    ( map { ( $_ => 1 ) } qw( input select option optgroup textarea button label ) ),

    # I've no idea why there's these latter exceptions.
    # I'm just following the HTML4.01 DTD.
    ( map { ( $_ => 1 ) } qw( object script map ) ),
);
|
---|
205 |
|
---|
206 | #from html4 strict:
|
---|
207 | #<!ENTITY % fontstyle "TT | I | B | BIG | SMALL">
|
---|
208 | #
|
---|
209 | #<!ENTITY % phrase "EM | STRONG | DFN | CODE |
|
---|
210 | # SAMP | KBD | VAR | CITE | ABBR | ACRONYM" >
|
---|
211 | #
|
---|
212 | #<!ENTITY % special
|
---|
213 | # "A | IMG | OBJECT | BR | SCRIPT | MAP | Q | SUB | SUP | SPAN | BDO">
|
---|
214 | #
|
---|
215 | #<!ENTITY % formctrl "INPUT | SELECT | TEXTAREA | LABEL | BUTTON">
|
---|
216 | #
|
---|
217 | #<!-- %inline; covers inline or "text-level" elements -->
|
---|
218 | #<!ENTITY % inline "#PCDATA | %fontstyle; | %phrase; | %special; | %formctrl;">
|
---|
219 |
|
---|
220 | =head2 hashset %HTML::Tagset::isHeadElement
|
---|
221 |
|
---|
This hashset contains all elements that should be
present only in the 'head' element of an HTML document.
|
---|
224 |
|
---|
225 | =cut
|
---|
226 |
|
---|
# Hashset of the elements that belong in the 'head' of a document.
%isHeadElement = map { ( $_ => 1 ) } qw(
    title base link meta isindex
    script style object bgsound
);
|
---|
229 |
|
---|
230 | =head2 hashset %HTML::Tagset::isList
|
---|
231 |
|
---|
232 | This hashset contains all elements that can contain "li" elements.
|
---|
233 |
|
---|
234 | =cut
|
---|
235 |
|
---|
# Hashset of the elements that can contain "li" elements.
%isList = ();
foreach my $list_tag ( qw( ul ol dir menu ) ) {
    $isList{$list_tag} = 1;
}
|
---|
237 |
|
---|
238 | =head2 hashset %HTML::Tagset::isTableElement
|
---|
239 |
|
---|
240 | This hashset contains all elements that are to be found only in/under
|
---|
241 | a "table" element.
|
---|
242 |
|
---|
243 | =cut
|
---|
244 |
|
---|
# Hashset of the elements found only in/under a "table" element.
# Note that "table" itself is not a member.
%isTableElement = map +( $_ => 1 ), qw(
    tr td th
    thead tbody tfoot
    caption col colgroup
);
|
---|
247 |
|
---|
248 | =head2 hashset %HTML::Tagset::isFormElement
|
---|
249 |
|
---|
250 | This hashset contains all elements that are to be found only in/under
|
---|
251 | a "form" element.
|
---|
252 |
|
---|
253 | =cut
|
---|
254 |
|
---|
# Hashset of the elements found only in/under a "form" element.
# Note that "form" itself is not a member.
%isFormElement = map { ( $_ => 1 ) } qw(
    input select option optgroup
    textarea button label
);
|
---|
257 |
|
---|
=head2 hashset %HTML::Tagset::isBodyElement
|
---|
259 |
|
---|
260 | This hashset contains all elements that are to be found only in/under
|
---|
261 | the "body" element of an HTML document.
|
---|
262 |
|
---|
263 | =cut
|
---|
264 |
|
---|
# Hashset of the elements that are to be found only in/under the "body"
# element: the names listed literally below, plus every key of
# %isFormElement, %isPhraseMarkup and %isTableElement as those hashes stand
# when this statement runs.
%isBodyElement = map { ( $_ => 1 ) } (
    qw( h1 h2 h3 h4 h5 h6 ),
    qw( p div pre plaintext address blockquote ),
    qw( xmp listing ),
    qw( center ),
    qw( multicol ),
    qw( iframe ilayer nolayer ),
    qw( bgsound ),
    qw( hr ),
    qw( ol ul dir menu li ),
    qw( dl dt dd ),
    qw( ins del ),
    qw( fieldset legend ),
    qw( map area ),
    qw( applet param object ),
    qw( isindex script noscript ),
    qw( table ),
    qw( form ),
    keys(%isFormElement),
    keys(%isPhraseMarkup),    # And everything phrasal
    keys(%isTableElement),
);
|
---|
293 |
|
---|
294 |
|
---|
295 | =head2 hashset %HTML::Tagset::isHeadOrBodyElement
|
---|
296 |
|
---|
297 | This hashset includes all elements that I notice can fall either in
|
---|
298 | the head or in the body.
|
---|
299 |
|
---|
300 | =cut
|
---|
301 |
|
---|
# Hashset of the elements that can fall either in the head or in the body.
%isHeadOrBodyElement = map +( $_ => 1 ), qw(
    script isindex style object
    map area param
    noscript bgsound
);
|
---|
304 | # i.e., if we find 'script' in the 'body' or the 'head', don't freak out.
|
---|
305 |
|
---|
306 |
|
---|
307 | =head2 hashset %HTML::Tagset::isKnown
|
---|
308 |
|
---|
309 | This hashset lists all known HTML elements.
|
---|
310 |
|
---|
311 | =cut
|
---|
312 |
|
---|
# Hashset of all known elements: everything in %isHeadElement and
# %isBodyElement (as they stand when this statement runs), plus the
# document-structure and frame elements and the "~" pseudo-elements.
# NOTE(review): this lists "~directive" whereas %emptyElement lists
# "~declaration" -- confirm which name consumers actually use before
# changing either table.
%isKnown = (
    %isHeadElement,
    %isBodyElement,
    map { ( $_ => 1 ) } (
        qw( head body html ),
        qw( frame frameset noframes ),
        qw( ~comment ~pi ~directive ~literal ),
    ),
);
|
---|
319 | # that should be all known tags ever ever
|
---|
320 |
|
---|
321 |
|
---|
322 | =head2 hashset %HTML::Tagset::canTighten
|
---|
323 |
|
---|
324 | This hashset lists elements that might have ignorable whitespace as
|
---|
325 | children or siblings.
|
---|
326 |
|
---|
327 | =cut
|
---|
328 |
|
---|
# Hashset of elements that might have ignorable whitespace as children or
# siblings.  Start from every known element ...
%canTighten = %isKnown;

# ... then drop everything phrasal, along with input and select.  xmp,
# listing, plaintext, and pre are untightenable too, and in a really
# special way.
foreach my $untightenable (
    keys(%isPhraseMarkup),
    qw( input select ),
    qw( xmp listing plaintext pre ),
) {
    delete $canTighten{$untightenable};
}

# hr and br are exceptional 'phrasal' things that ARE subject to tightening,
# so they are (re)instated after the deletions above.
@canTighten{ qw( hr br ) } = ( 1, 1 );
|
---|
338 |
|
---|
339 | # The one case where I can think of my tightening rules failing is:
|
---|
340 | # <p>foo bar<center> <em>baz quux</em> ...
|
---|
341 | # ^-- that would get deleted.
|
---|
342 | # But that's pretty gruesome code anyhow. You gets what you pays for.
|
---|
343 |
|
---|
344 | #==========================================================================
|
---|
345 |
|
---|
346 | =head2 array @HTML::Tagset::p_closure_barriers
|
---|
347 |
|
---|
348 | This array has a meaning that I have only seen a need for in
|
---|
349 | C<HTML::TreeBuilder>, but I include it here on the off chance that someone
|
---|
350 | might find it of use:
|
---|
351 |
|
---|
When we see a "E<lt>pE<gt>" token, we go look up the lineage for a p
element we might have to minimize. At first sight, we might say that
if there's a p anywhere in the lineage of this new p, it should be
closed. But that's wrong. Consider this document:
|
---|
356 |
|
---|
357 | <html>
|
---|
358 | <head>
|
---|
359 | <title>foo</title>
|
---|
360 | </head>
|
---|
361 | <body>
|
---|
362 | <p>foo
|
---|
363 | <table>
|
---|
364 | <tr>
|
---|
365 | <td>
|
---|
366 | foo
|
---|
367 | <p>bar
|
---|
368 | </td>
|
---|
369 | </tr>
|
---|
370 | </table>
|
---|
371 | </p>
|
---|
372 | </body>
|
---|
373 | </html>
|
---|
374 |
|
---|
375 | The second p is quite legally inside a much higher p.
|
---|
376 |
|
---|
377 | My formalization of the reason why this is legal, but this:
|
---|
378 |
|
---|
379 | <p>foo<p>bar</p></p>
|
---|
380 |
|
---|
381 | isn't, is that something about the table constitutes a "barrier" to
|
---|
382 | the application of the rule about what p must minimize.
|
---|
383 |
|
---|
384 | So C<@HTML::Tagset::p_closure_barriers> is the list of all such
|
---|
385 | barrier-tags.
|
---|
386 |
|
---|
387 | =cut
|
---|
388 |
|
---|
# The barrier tags: elements that stop the upward search for an open p
# element that a newly seen p would otherwise have to close.
@p_closure_barriers = (
    'li', 'blockquote',
    'ul', 'ol', 'menu', 'dir',
    'dl', 'dt', 'dd',
    'td', 'th', 'tr', 'table', 'caption',
);
|
---|
395 |
|
---|
396 | # In an ideal world (i.e., XHTML) we wouldn't have to bother with any of this
|
---|
397 | # monkey business of barriers to minimization!
|
---|
398 |
|
---|
=head2 hashset %HTML::Tagset::isCDATA_Parent
|
---|
400 |
|
---|
401 | This hashset includes all elements whose content is CDATA.
|
---|
402 |
|
---|
403 | =cut
|
---|
404 |
|
---|
# Hashset of the elements whose content is CDATA.
%isCDATA_Parent = ();
foreach my $cdata_tag ( qw( script style xmp listing plaintext ) ) {
    $isCDATA_Parent{$cdata_tag} = 1;
}
|
---|
407 |
|
---|
408 | # TODO: there's nothing else that takes CDATA children, right?
|
---|
409 |
|
---|
410 | # As the HTML3 DTD (Raggett 1995-04-24) noted:
|
---|
411 | # The XMP, LISTING and PLAINTEXT tags are incompatible with SGML
|
---|
412 | # and derive from very early versions of HTML. They require non-
|
---|
413 | # standard parsers and will cause problems for processing
|
---|
414 | # documents with standard SGML tools.
|
---|
415 |
|
---|
416 |
|
---|
417 | =head1 CAVEATS
|
---|
418 |
|
---|
You may find it useful to alter the behavior of modules (like
C<HTML::Element> or C<HTML::TreeBuilder>) that use C<HTML::Tagset>'s
data tables by altering the data tables themselves. You are welcome
to try, but be careful; and be aware that different modules may or may
not react differently to the data tables being changed.
|
---|
424 |
|
---|
425 | Note that it may be inappropriate to use these tables for I<producing>
|
---|
426 | HTML -- for example, C<%isHeadOrBodyElement> lists the tagnames
|
---|
427 | for all elements that can appear either in the head or in the body,
|
---|
428 | such as "script". That doesn't mean that I am saying your code that
|
---|
429 | produces HTML should feel free to put script elements in either place!
|
---|
430 | If you are producing programs that spit out HTML, you should be
|
---|
431 | I<intimately> familiar with the DTDs for HTML or XHTML (available at
|
---|
432 | C<http://www.w3.org/>), and you should slavishly obey them, not
|
---|
433 | the data tables in this document.
|
---|
434 |
|
---|
435 | =head1 SEE ALSO
|
---|
436 |
|
---|
437 | L<HTML::Element>, L<HTML::TreeBuilder>, L<HTML::LinkExtor>
|
---|
438 |
|
---|
439 | =head1 COPYRIGHT & LICENSE
|
---|
440 |
|
---|
441 | Copyright 1995-2000 Gisle Aas.
|
---|
442 |
|
---|
443 | Copyright 2000-2005 Sean M. Burke.
|
---|
444 |
|
---|
445 | Copyright 2005 Andy Lester.
|
---|
446 |
|
---|
447 | This program is free software; you can redistribute it and/or modify it
|
---|
448 | under the same terms as Perl itself.
|
---|
449 |
|
---|
450 | =head1 ACKNOWLEDGEMENTS
|
---|
451 |
|
---|
452 | Most of the code/data in this module was adapted from code written
|
---|
453 | by Gisle Aas for C<HTML::Element>, C<HTML::TreeBuilder>, and
|
---|
454 | C<HTML::LinkExtor>. Then it was maintained by Sean M. Burke.
|
---|
455 |
|
---|
456 | =head1 AUTHOR
|
---|
457 |
|
---|
458 | Current maintainer: Andy Lester, C<< <andy at petdance.com> >>
|
---|
459 |
|
---|
460 | =head1 BUGS
|
---|
461 |
|
---|
462 | Please report any bugs or feature requests to
|
---|
463 | C<bug-html-tagset at rt.cpan.org>, or through the web interface at
|
---|
464 | L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-Tagset>. I will
|
---|
465 | be notified, and then you'll automatically be notified of progress on
|
---|
466 | your bug as I make changes.
|
---|
467 |
|
---|
468 | =cut
|
---|
469 |
|
---|
470 | 1;
|
---|