source: trunk/gsdl/perllib/cpan/HTML/Entities.pm@ 14078

Last change on this file since 14078 was 14078, checked in by lh92, 17 years ago

Perl modules required for HTMLTidy

  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1package HTML::Entities;
2
3# $Id: Entities.pm 14078 2007-05-17 03:15:41Z lh92 $
4
5=head1 NAME
6
7HTML::Entities - Encode or decode strings with HTML entities
8
9=head1 SYNOPSIS
10
11 use HTML::Entities;
12
13 $a = "Våre norske tegn bør &#230res";
14 decode_entities($a);
15 encode_entities($a, "\200-\377");
16
17For example, this:
18
19 $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20 print encode_entities($input), "\n"
21
22Prints this out:
23
24 vis-à-vis Beyoncé's naïve
25 papier-mâché résumé
26
27=head1 DESCRIPTION
28
29This module deals with encoding and decoding of strings with HTML
30character entities. The module provides the following functions:
31
32=over 4
33
34=item decode_entities( $string, ... )
35
36This routine replaces HTML entities found in the $string with the
37corresponding Unicode character. Under perl 5.6 and earlier only
38characters in the Latin-1 range are replaced. Unrecognized
39entities are left alone.
40
41If multiple strings are provided as argument they are each decoded
42separately and the same number of strings are returned.
43
44If called in void context the arguments are decoded in-place.
45
46This routine is exported by default.
47
48=item _decode_entities( $string, \%entity2char )
49
50=item _decode_entities( $string, \%entity2char, $expand_prefix )
51
52This will in-place replace HTML entities in $string. The %entity2char
53hash must be provided. Named entities not found in the %entity2char
54hash are left alone. Numeric entities are expanded unless their value
55overflow.
56
57The keys in %entity2char are the entity names to be expanded and their
58values are what they should expand into. The values do not have to be
59single character strings. If a key has ";" as suffix,
60then occurrences in $string are only expanded if properly terminated
61with ";". Entities without ";" will be expanded regardless of how
62they are terminated for compatiblity with how common browsers treat
63entities in the Latin-1 range.
64
65If $expand_prefix is TRUE then entities without trailing ";" in
66%entity2char will even be expanded as a prefix of a longer
67unrecognized name. The longest matching name in %entity2char will be
68used. This is mainly present for compatibility with an MSIE
69misfeature.
70
71 $string = "foo&nbspbar";
72 _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
73 print $string; # will print "foo bar"
74
75This routine is exported by default.
76
77=item encode_entities( $string )
78
79=item encode_entities( $string, $unsafe_chars )
80
81This routine replaces unsafe characters in $string with their entity
82representation. A second argument can be given to specify which
83characters to consider unsafe (i.e., which to escape). The default set
84of characters to encode are control chars, high-bit chars, and the
85C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
86characters. But this, for example, would encode I<just> the
87C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
88
89 $encoded = encode_entities($input, '<>&"');
90
91This routine is exported by default.
92
93=item encode_entities_numeric( $string )
94
95=item encode_entities_numeric( $string, $unsafe_chars )
96
97This routine works just like encode_entities, except that the replacement
98entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
99example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
100C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
101
102This routine is I<not> exported by default. But you can always
103export it with C<use HTML::Entities qw(encode_entities_numeric);>
104or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
105
106=back
107
108All these routines modify the string passed as the first argument, if
109called in a void context. In scalar and array contexts, the encoded or
110decoded string is returned (without changing the input string).
111
112If you prefer not to import these routines into your namespace, you can
113call them as:
114
115 use HTML::Entities ();
116 $decoded = HTML::Entities::decode($a);
117 $encoded = HTML::Entities::encode($a);
118 $encoded = HTML::Entities::encode_numeric($a);
119
120The module can also export the %char2entity and the %entity2char
121hashes, which contain the mapping from all characters to the
122corresponding entities (and vice versa, respectively).
123
124=head1 COPYRIGHT
125
126Copyright 1995-2006 Gisle Aas. All rights reserved.
127
128This library is free software; you can redistribute it and/or
129modify it under the same terms as Perl itself.
130
131=cut
132
133use strict;
134use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
135use vars qw(%entity2char %char2entity);
136
137require 5.004;
138require Exporter;
139@ISA = qw(Exporter);
140
141@EXPORT = qw(encode_entities decode_entities _decode_entities);
142@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
143
144$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
145sub Version { $VERSION; }
146
147require HTML::Parser; # for fast XS implemented decode_entities
148
149
150%entity2char = (
151 # Some normal chars that have special meaning in SGML context
152 amp => '&', # ampersand
153'gt' => '>', # greater than
154'lt' => '<', # less than
155 quot => '"', # double quote
156 apos => "'", # single quote
157
158 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
159 AElig => chr(198), # capital AE diphthong (ligature)
160 Aacute => chr(193), # capital A, acute accent
161 Acirc => chr(194), # capital A, circumflex accent
162 Agrave => chr(192), # capital A, grave accent
163 Aring => chr(197), # capital A, ring
164 Atilde => chr(195), # capital A, tilde
165 Auml => chr(196), # capital A, dieresis or umlaut mark
166 Ccedil => chr(199), # capital C, cedilla
167 ETH => chr(208), # capital Eth, Icelandic
168 Eacute => chr(201), # capital E, acute accent
169 Ecirc => chr(202), # capital E, circumflex accent
170 Egrave => chr(200), # capital E, grave accent
171 Euml => chr(203), # capital E, dieresis or umlaut mark
172 Iacute => chr(205), # capital I, acute accent
173 Icirc => chr(206), # capital I, circumflex accent
174 Igrave => chr(204), # capital I, grave accent
175 Iuml => chr(207), # capital I, dieresis or umlaut mark
176 Ntilde => chr(209), # capital N, tilde
177 Oacute => chr(211), # capital O, acute accent
178 Ocirc => chr(212), # capital O, circumflex accent
179 Ograve => chr(210), # capital O, grave accent
180 Oslash => chr(216), # capital O, slash
181 Otilde => chr(213), # capital O, tilde
182 Ouml => chr(214), # capital O, dieresis or umlaut mark
183 THORN => chr(222), # capital THORN, Icelandic
184 Uacute => chr(218), # capital U, acute accent
185 Ucirc => chr(219), # capital U, circumflex accent
186 Ugrave => chr(217), # capital U, grave accent
187 Uuml => chr(220), # capital U, dieresis or umlaut mark
188 Yacute => chr(221), # capital Y, acute accent
189 aacute => chr(225), # small a, acute accent
190 acirc => chr(226), # small a, circumflex accent
191 aelig => chr(230), # small ae diphthong (ligature)
192 agrave => chr(224), # small a, grave accent
193 aring => chr(229), # small a, ring
194 atilde => chr(227), # small a, tilde
195 auml => chr(228), # small a, dieresis or umlaut mark
196 ccedil => chr(231), # small c, cedilla
197 eacute => chr(233), # small e, acute accent
198 ecirc => chr(234), # small e, circumflex accent
199 egrave => chr(232), # small e, grave accent
200 eth => chr(240), # small eth, Icelandic
201 euml => chr(235), # small e, dieresis or umlaut mark
202 iacute => chr(237), # small i, acute accent
203 icirc => chr(238), # small i, circumflex accent
204 igrave => chr(236), # small i, grave accent
205 iuml => chr(239), # small i, dieresis or umlaut mark
206 ntilde => chr(241), # small n, tilde
207 oacute => chr(243), # small o, acute accent
208 ocirc => chr(244), # small o, circumflex accent
209 ograve => chr(242), # small o, grave accent
210 oslash => chr(248), # small o, slash
211 otilde => chr(245), # small o, tilde
212 ouml => chr(246), # small o, dieresis or umlaut mark
213 szlig => chr(223), # small sharp s, German (sz ligature)
214 thorn => chr(254), # small thorn, Icelandic
215 uacute => chr(250), # small u, acute accent
216 ucirc => chr(251), # small u, circumflex accent
217 ugrave => chr(249), # small u, grave accent
218 uuml => chr(252), # small u, dieresis or umlaut mark
219 yacute => chr(253), # small y, acute accent
220 yuml => chr(255), # small y, dieresis or umlaut mark
221
222 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
223 copy => chr(169), # copyright sign
224 reg => chr(174), # registered sign
225 nbsp => chr(160), # non breaking space
226
227 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
228 iexcl => chr(161),
229 cent => chr(162),
230 pound => chr(163),
231 curren => chr(164),
232 yen => chr(165),
233 brvbar => chr(166),
234 sect => chr(167),
235 uml => chr(168),
236 ordf => chr(170),
237 laquo => chr(171),
238'not' => chr(172), # not is a keyword in perl
239 shy => chr(173),
240 macr => chr(175),
241 deg => chr(176),
242 plusmn => chr(177),
243 sup1 => chr(185),
244 sup2 => chr(178),
245 sup3 => chr(179),
246 acute => chr(180),
247 micro => chr(181),
248 para => chr(182),
249 middot => chr(183),
250 cedil => chr(184),
251 ordm => chr(186),
252 raquo => chr(187),
253 frac14 => chr(188),
254 frac12 => chr(189),
255 frac34 => chr(190),
256 iquest => chr(191),
257'times' => chr(215), # times is a keyword in perl
258 divide => chr(247),
259
260 ( $] > 5.007 ? (
261 'OElig;' => chr(338),
262 'oelig;' => chr(339),
263 'Scaron;' => chr(352),
264 'scaron;' => chr(353),
265 'Yuml;' => chr(376),
266 'fnof;' => chr(402),
267 'circ;' => chr(710),
268 'tilde;' => chr(732),
269 'Alpha;' => chr(913),
270 'Beta;' => chr(914),
271 'Gamma;' => chr(915),
272 'Delta;' => chr(916),
273 'Epsilon;' => chr(917),
274 'Zeta;' => chr(918),
275 'Eta;' => chr(919),
276 'Theta;' => chr(920),
277 'Iota;' => chr(921),
278 'Kappa;' => chr(922),
279 'Lambda;' => chr(923),
280 'Mu;' => chr(924),
281 'Nu;' => chr(925),
282 'Xi;' => chr(926),
283 'Omicron;' => chr(927),
284 'Pi;' => chr(928),
285 'Rho;' => chr(929),
286 'Sigma;' => chr(931),
287 'Tau;' => chr(932),
288 'Upsilon;' => chr(933),
289 'Phi;' => chr(934),
290 'Chi;' => chr(935),
291 'Psi;' => chr(936),
292 'Omega;' => chr(937),
293 'alpha;' => chr(945),
294 'beta;' => chr(946),
295 'gamma;' => chr(947),
296 'delta;' => chr(948),
297 'epsilon;' => chr(949),
298 'zeta;' => chr(950),
299 'eta;' => chr(951),
300 'theta;' => chr(952),
301 'iota;' => chr(953),
302 'kappa;' => chr(954),
303 'lambda;' => chr(955),
304 'mu;' => chr(956),
305 'nu;' => chr(957),
306 'xi;' => chr(958),
307 'omicron;' => chr(959),
308 'pi;' => chr(960),
309 'rho;' => chr(961),
310 'sigmaf;' => chr(962),
311 'sigma;' => chr(963),
312 'tau;' => chr(964),
313 'upsilon;' => chr(965),
314 'phi;' => chr(966),
315 'chi;' => chr(967),
316 'psi;' => chr(968),
317 'omega;' => chr(969),
318 'thetasym;' => chr(977),
319 'upsih;' => chr(978),
320 'piv;' => chr(982),
321 'ensp;' => chr(8194),
322 'emsp;' => chr(8195),
323 'thinsp;' => chr(8201),
324 'zwnj;' => chr(8204),
325 'zwj;' => chr(8205),
326 'lrm;' => chr(8206),
327 'rlm;' => chr(8207),
328 'ndash;' => chr(8211),
329 'mdash;' => chr(8212),
330 'lsquo;' => chr(8216),
331 'rsquo;' => chr(8217),
332 'sbquo;' => chr(8218),
333 'ldquo;' => chr(8220),
334 'rdquo;' => chr(8221),
335 'bdquo;' => chr(8222),
336 'dagger;' => chr(8224),
337 'Dagger;' => chr(8225),
338 'bull;' => chr(8226),
339 'hellip;' => chr(8230),
340 'permil;' => chr(8240),
341 'prime;' => chr(8242),
342 'Prime;' => chr(8243),
343 'lsaquo;' => chr(8249),
344 'rsaquo;' => chr(8250),
345 'oline;' => chr(8254),
346 'frasl;' => chr(8260),
347 'euro;' => chr(8364),
348 'image;' => chr(8465),
349 'weierp;' => chr(8472),
350 'real;' => chr(8476),
351 'trade;' => chr(8482),
352 'alefsym;' => chr(8501),
353 'larr;' => chr(8592),
354 'uarr;' => chr(8593),
355 'rarr;' => chr(8594),
356 'darr;' => chr(8595),
357 'harr;' => chr(8596),
358 'crarr;' => chr(8629),
359 'lArr;' => chr(8656),
360 'uArr;' => chr(8657),
361 'rArr;' => chr(8658),
362 'dArr;' => chr(8659),
363 'hArr;' => chr(8660),
364 'forall;' => chr(8704),
365 'part;' => chr(8706),
366 'exist;' => chr(8707),
367 'empty;' => chr(8709),
368 'nabla;' => chr(8711),
369 'isin;' => chr(8712),
370 'notin;' => chr(8713),
371 'ni;' => chr(8715),
372 'prod;' => chr(8719),
373 'sum;' => chr(8721),
374 'minus;' => chr(8722),
375 'lowast;' => chr(8727),
376 'radic;' => chr(8730),
377 'prop;' => chr(8733),
378 'infin;' => chr(8734),
379 'ang;' => chr(8736),
380 'and;' => chr(8743),
381 'or;' => chr(8744),
382 'cap;' => chr(8745),
383 'cup;' => chr(8746),
384 'int;' => chr(8747),
385 'there4;' => chr(8756),
386 'sim;' => chr(8764),
387 'cong;' => chr(8773),
388 'asymp;' => chr(8776),
389 'ne;' => chr(8800),
390 'equiv;' => chr(8801),
391 'le;' => chr(8804),
392 'ge;' => chr(8805),
393 'sub;' => chr(8834),
394 'sup;' => chr(8835),
395 'nsub;' => chr(8836),
396 'sube;' => chr(8838),
397 'supe;' => chr(8839),
398 'oplus;' => chr(8853),
399 'otimes;' => chr(8855),
400 'perp;' => chr(8869),
401 'sdot;' => chr(8901),
402 'lceil;' => chr(8968),
403 'rceil;' => chr(8969),
404 'lfloor;' => chr(8970),
405 'rfloor;' => chr(8971),
406 'lang;' => chr(9001),
407 'rang;' => chr(9002),
408 'loz;' => chr(9674),
409 'spades;' => chr(9824),
410 'clubs;' => chr(9827),
411 'hearts;' => chr(9829),
412 'diams;' => chr(9830),
413 ) : ())
414);
415
416
417# Make the opposite mapping
418while (my($entity, $char) = each(%entity2char)) {
419 $entity =~ s/;\z//;
420 $char2entity{$char} = "&$entity;";
421}
422delete $char2entity{"'"}; # only one-way decoding
423
424# Fill in missing entities
425for (0 .. 255) {
426 next if exists $char2entity{chr($_)};
427 $char2entity{chr($_)} = "&#$_;";
428}
429
430my %subst; # compiled encoding regexps
431
432sub decode_entities_old
433{
434 my $array;
435 if (defined wantarray) {
436 $array = [@_]; # copy
437 } else {
438 $array = \@_; # modify in-place
439 }
440 my $c;
441 for (@$array) {
442 s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
443 s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
444 s/(&(\w+);?)/$entity2char{$2} || $1/eg;
445 }
446 wantarray ? @$array : $array->[0];
447}
448
449sub encode_entities
450{
451 my $ref;
452 if (defined wantarray) {
453 my $x = $_[0];
454 $ref = \$x; # copy
455 } else {
456 $ref = \$_[0]; # modify in-place
457 }
458 if (defined $_[1] and length $_[1]) {
459 unless (exists $subst{$_[1]}) {
460 # Because we can't compile regex we fake it with a cached sub
461 my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
462 $subst{$_[1]} = eval $code;
463 die( $@ . " while trying to turn range: \"$_[1]\"\n "
464 . "into code: $code\n "
465 ) if $@;
466 }
467 &{$subst{$_[1]}}($$ref);
468 } else {
469 # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
470 $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
471 }
472 $$ref;
473}
474
475sub encode_entities_numeric {
476 local %char2entity;
477 return &encode_entities; # a goto &encode_entities wouldn't work
478}
479
480
481sub num_entity {
482 sprintf "&#x%X;", ord($_[0]);
483}
484
485# Set up aliases
486*encode = \&encode_entities;
487*encode_numeric = \&encode_entities_numeric;
488*encode_numerically = \&encode_entities_numeric;
489*decode = \&decode_entities;
490
4911;
Note: See TracBrowser for help on using the repository browser.