source: gs2-extensions/parallel-building/trunk/src/perllib/cpan/HTML/Entities.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 15.0 KB
Line 
1package HTML::Entities;
2
3# $Id: Entities.pm 22649 2010-08-17 02:03:34Z davidb $
4
5=head1 NAME
6
7HTML::Entities - Encode or decode strings with HTML entities
8
9=head1 SYNOPSIS
10
11 use HTML::Entities;
12
13 $a = "Våre norske tegn bør &#230res";
14 decode_entities($a);
15 encode_entities($a, "\200-\377");
16
17For example, this:
18
19 $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20 print encode_entities($input), "\n"
21
22Prints this out:
23
24 vis-à-vis Beyoncé's naïve
25 papier-mâché résumé
26
27=head1 DESCRIPTION
28
29This module deals with encoding and decoding of strings with HTML
30character entities. The module provides the following functions:
31
32=over 4
33
34=item decode_entities( $string, ... )
35
36This routine replaces HTML entities found in the $string with the
37corresponding Unicode character. Under perl 5.6 and earlier only
38characters in the Latin-1 range are replaced. Unrecognized
39entities are left alone.
40
41If multiple strings are provided as argument they are each decoded
42separately and the same number of strings are returned.
43
44If called in void context the arguments are decoded in-place.
45
46This routine is exported by default.
47
48=item _decode_entities( $string, \%entity2char )
49
50=item _decode_entities( $string, \%entity2char, $expand_prefix )
51
52This will in-place replace HTML entities in $string. The %entity2char
53hash must be provided. Named entities not found in the %entity2char
54hash are left alone. Numeric entities are expanded unless their value
55overflow.
56
57The keys in %entity2char are the entity names to be expanded and their
58values are what they should expand into. The values do not have to be
59single character strings. If a key has ";" as suffix,
60then occurrences in $string are only expanded if properly terminated
61with ";". Entities without ";" will be expanded regardless of how
62they are terminated for compatiblity with how common browsers treat
63entities in the Latin-1 range.
64
65If $expand_prefix is TRUE then entities without trailing ";" in
66%entity2char will even be expanded as a prefix of a longer
67unrecognized name. The longest matching name in %entity2char will be
68used. This is mainly present for compatibility with an MSIE
69misfeature.
70
71 $string = "foo&nbspbar";
72 _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
73 print $string; # will print "foo bar"
74
75This routine is exported by default.
76
77=item encode_entities( $string )
78
79=item encode_entities( $string, $unsafe_chars )
80
81This routine replaces unsafe characters in $string with their entity
82representation. A second argument can be given to specify which
83characters to consider unsafe (i.e., which to escape). The default set
84of characters to encode are control chars, high-bit chars, and the
85C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
86characters. But this, for example, would encode I<just> the
87C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
88
89 $encoded = encode_entities($input, '<>&"');
90
91This routine is exported by default.
92
93=item encode_entities_numeric( $string )
94
95=item encode_entities_numeric( $string, $unsafe_chars )
96
97This routine works just like encode_entities, except that the replacement
98entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
99example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
100C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
101
102This routine is I<not> exported by default. But you can always
103export it with C<use HTML::Entities qw(encode_entities_numeric);>
104or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
105
106=back
107
108All these routines modify the string passed as the first argument, if
109called in a void context. In scalar and array contexts, the encoded or
110decoded string is returned (without changing the input string).
111
112If you prefer not to import these routines into your namespace, you can
113call them as:
114
115 use HTML::Entities ();
116 $decoded = HTML::Entities::decode($a);
117 $encoded = HTML::Entities::encode($a);
118 $encoded = HTML::Entities::encode_numeric($a);
119
120The module can also export the %char2entity and the %entity2char
121hashes, which contain the mapping from all characters to the
122corresponding entities (and vice versa, respectively).
123
124=head1 COPYRIGHT
125
126Copyright 1995-2006 Gisle Aas. All rights reserved.
127
128This library is free software; you can redistribute it and/or
129modify it under the same terms as Perl itself.
130
131=cut
132
133use strict;
134use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
135use vars qw(%entity2char %char2entity);
136
137require 5.004;
138require Exporter;
139@ISA = qw(Exporter);
140
141@EXPORT = qw(encode_entities decode_entities _decode_entities);
142@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
143
144# The version we (Greenstone) seem to be using looks like it's come
145# from CVS/SVN and so does not parse the following sprintf statement
146# => Doesn't appear to be used anywhere in this file, so commenting out
147#
148#$VERSION = sprintf("%d.%02d", q$Revision: 22649 $ =~ /(\d+)\.(\d+)/);
149#sub Version { $VERSION; }
150
151require HTML::Parser; # for fast XS implemented decode_entities
152
153
154%entity2char = (
155 # Some normal chars that have special meaning in SGML context
156 amp => '&', # ampersand
157'gt' => '>', # greater than
158'lt' => '<', # less than
159 quot => '"', # double quote
160 apos => "'", # single quote
161
162 # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
163 AElig => chr(198), # capital AE diphthong (ligature)
164 Aacute => chr(193), # capital A, acute accent
165 Acirc => chr(194), # capital A, circumflex accent
166 Agrave => chr(192), # capital A, grave accent
167 Aring => chr(197), # capital A, ring
168 Atilde => chr(195), # capital A, tilde
169 Auml => chr(196), # capital A, dieresis or umlaut mark
170 Ccedil => chr(199), # capital C, cedilla
171 ETH => chr(208), # capital Eth, Icelandic
172 Eacute => chr(201), # capital E, acute accent
173 Ecirc => chr(202), # capital E, circumflex accent
174 Egrave => chr(200), # capital E, grave accent
175 Euml => chr(203), # capital E, dieresis or umlaut mark
176 Iacute => chr(205), # capital I, acute accent
177 Icirc => chr(206), # capital I, circumflex accent
178 Igrave => chr(204), # capital I, grave accent
179 Iuml => chr(207), # capital I, dieresis or umlaut mark
180 Ntilde => chr(209), # capital N, tilde
181 Oacute => chr(211), # capital O, acute accent
182 Ocirc => chr(212), # capital O, circumflex accent
183 Ograve => chr(210), # capital O, grave accent
184 Oslash => chr(216), # capital O, slash
185 Otilde => chr(213), # capital O, tilde
186 Ouml => chr(214), # capital O, dieresis or umlaut mark
187 THORN => chr(222), # capital THORN, Icelandic
188 Uacute => chr(218), # capital U, acute accent
189 Ucirc => chr(219), # capital U, circumflex accent
190 Ugrave => chr(217), # capital U, grave accent
191 Uuml => chr(220), # capital U, dieresis or umlaut mark
192 Yacute => chr(221), # capital Y, acute accent
193 aacute => chr(225), # small a, acute accent
194 acirc => chr(226), # small a, circumflex accent
195 aelig => chr(230), # small ae diphthong (ligature)
196 agrave => chr(224), # small a, grave accent
197 aring => chr(229), # small a, ring
198 atilde => chr(227), # small a, tilde
199 auml => chr(228), # small a, dieresis or umlaut mark
200 ccedil => chr(231), # small c, cedilla
201 eacute => chr(233), # small e, acute accent
202 ecirc => chr(234), # small e, circumflex accent
203 egrave => chr(232), # small e, grave accent
204 eth => chr(240), # small eth, Icelandic
205 euml => chr(235), # small e, dieresis or umlaut mark
206 iacute => chr(237), # small i, acute accent
207 icirc => chr(238), # small i, circumflex accent
208 igrave => chr(236), # small i, grave accent
209 iuml => chr(239), # small i, dieresis or umlaut mark
210 ntilde => chr(241), # small n, tilde
211 oacute => chr(243), # small o, acute accent
212 ocirc => chr(244), # small o, circumflex accent
213 ograve => chr(242), # small o, grave accent
214 oslash => chr(248), # small o, slash
215 otilde => chr(245), # small o, tilde
216 ouml => chr(246), # small o, dieresis or umlaut mark
217 szlig => chr(223), # small sharp s, German (sz ligature)
218 thorn => chr(254), # small thorn, Icelandic
219 uacute => chr(250), # small u, acute accent
220 ucirc => chr(251), # small u, circumflex accent
221 ugrave => chr(249), # small u, grave accent
222 uuml => chr(252), # small u, dieresis or umlaut mark
223 yacute => chr(253), # small y, acute accent
224 yuml => chr(255), # small y, dieresis or umlaut mark
225
226 # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
227 copy => chr(169), # copyright sign
228 reg => chr(174), # registered sign
229 nbsp => chr(160), # non breaking space
230
231 # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
232 iexcl => chr(161),
233 cent => chr(162),
234 pound => chr(163),
235 curren => chr(164),
236 yen => chr(165),
237 brvbar => chr(166),
238 sect => chr(167),
239 uml => chr(168),
240 ordf => chr(170),
241 laquo => chr(171),
242'not' => chr(172), # not is a keyword in perl
243 shy => chr(173),
244 macr => chr(175),
245 deg => chr(176),
246 plusmn => chr(177),
247 sup1 => chr(185),
248 sup2 => chr(178),
249 sup3 => chr(179),
250 acute => chr(180),
251 micro => chr(181),
252 para => chr(182),
253 middot => chr(183),
254 cedil => chr(184),
255 ordm => chr(186),
256 raquo => chr(187),
257 frac14 => chr(188),
258 frac12 => chr(189),
259 frac34 => chr(190),
260 iquest => chr(191),
261'times' => chr(215), # times is a keyword in perl
262 divide => chr(247),
263
264 ( $] > 5.007 ? (
265 'OElig;' => chr(338),
266 'oelig;' => chr(339),
267 'Scaron;' => chr(352),
268 'scaron;' => chr(353),
269 'Yuml;' => chr(376),
270 'fnof;' => chr(402),
271 'circ;' => chr(710),
272 'tilde;' => chr(732),
273 'Alpha;' => chr(913),
274 'Beta;' => chr(914),
275 'Gamma;' => chr(915),
276 'Delta;' => chr(916),
277 'Epsilon;' => chr(917),
278 'Zeta;' => chr(918),
279 'Eta;' => chr(919),
280 'Theta;' => chr(920),
281 'Iota;' => chr(921),
282 'Kappa;' => chr(922),
283 'Lambda;' => chr(923),
284 'Mu;' => chr(924),
285 'Nu;' => chr(925),
286 'Xi;' => chr(926),
287 'Omicron;' => chr(927),
288 'Pi;' => chr(928),
289 'Rho;' => chr(929),
290 'Sigma;' => chr(931),
291 'Tau;' => chr(932),
292 'Upsilon;' => chr(933),
293 'Phi;' => chr(934),
294 'Chi;' => chr(935),
295 'Psi;' => chr(936),
296 'Omega;' => chr(937),
297 'alpha;' => chr(945),
298 'beta;' => chr(946),
299 'gamma;' => chr(947),
300 'delta;' => chr(948),
301 'epsilon;' => chr(949),
302 'zeta;' => chr(950),
303 'eta;' => chr(951),
304 'theta;' => chr(952),
305 'iota;' => chr(953),
306 'kappa;' => chr(954),
307 'lambda;' => chr(955),
308 'mu;' => chr(956),
309 'nu;' => chr(957),
310 'xi;' => chr(958),
311 'omicron;' => chr(959),
312 'pi;' => chr(960),
313 'rho;' => chr(961),
314 'sigmaf;' => chr(962),
315 'sigma;' => chr(963),
316 'tau;' => chr(964),
317 'upsilon;' => chr(965),
318 'phi;' => chr(966),
319 'chi;' => chr(967),
320 'psi;' => chr(968),
321 'omega;' => chr(969),
322 'thetasym;' => chr(977),
323 'upsih;' => chr(978),
324 'piv;' => chr(982),
325 'ensp;' => chr(8194),
326 'emsp;' => chr(8195),
327 'thinsp;' => chr(8201),
328 'zwnj;' => chr(8204),
329 'zwj;' => chr(8205),
330 'lrm;' => chr(8206),
331 'rlm;' => chr(8207),
332 'ndash;' => chr(8211),
333 'mdash;' => chr(8212),
334 'lsquo;' => chr(8216),
335 'rsquo;' => chr(8217),
336 'sbquo;' => chr(8218),
337 'ldquo;' => chr(8220),
338 'rdquo;' => chr(8221),
339 'bdquo;' => chr(8222),
340 'dagger;' => chr(8224),
341 'Dagger;' => chr(8225),
342 'bull;' => chr(8226),
343 'hellip;' => chr(8230),
344 'permil;' => chr(8240),
345 'prime;' => chr(8242),
346 'Prime;' => chr(8243),
347 'lsaquo;' => chr(8249),
348 'rsaquo;' => chr(8250),
349 'oline;' => chr(8254),
350 'frasl;' => chr(8260),
351 'euro;' => chr(8364),
352 'image;' => chr(8465),
353 'weierp;' => chr(8472),
354 'real;' => chr(8476),
355 'trade;' => chr(8482),
356 'alefsym;' => chr(8501),
357 'larr;' => chr(8592),
358 'uarr;' => chr(8593),
359 'rarr;' => chr(8594),
360 'darr;' => chr(8595),
361 'harr;' => chr(8596),
362 'crarr;' => chr(8629),
363 'lArr;' => chr(8656),
364 'uArr;' => chr(8657),
365 'rArr;' => chr(8658),
366 'dArr;' => chr(8659),
367 'hArr;' => chr(8660),
368 'forall;' => chr(8704),
369 'part;' => chr(8706),
370 'exist;' => chr(8707),
371 'empty;' => chr(8709),
372 'nabla;' => chr(8711),
373 'isin;' => chr(8712),
374 'notin;' => chr(8713),
375 'ni;' => chr(8715),
376 'prod;' => chr(8719),
377 'sum;' => chr(8721),
378 'minus;' => chr(8722),
379 'lowast;' => chr(8727),
380 'radic;' => chr(8730),
381 'prop;' => chr(8733),
382 'infin;' => chr(8734),
383 'ang;' => chr(8736),
384 'and;' => chr(8743),
385 'or;' => chr(8744),
386 'cap;' => chr(8745),
387 'cup;' => chr(8746),
388 'int;' => chr(8747),
389 'there4;' => chr(8756),
390 'sim;' => chr(8764),
391 'cong;' => chr(8773),
392 'asymp;' => chr(8776),
393 'ne;' => chr(8800),
394 'equiv;' => chr(8801),
395 'le;' => chr(8804),
396 'ge;' => chr(8805),
397 'sub;' => chr(8834),
398 'sup;' => chr(8835),
399 'nsub;' => chr(8836),
400 'sube;' => chr(8838),
401 'supe;' => chr(8839),
402 'oplus;' => chr(8853),
403 'otimes;' => chr(8855),
404 'perp;' => chr(8869),
405 'sdot;' => chr(8901),
406 'lceil;' => chr(8968),
407 'rceil;' => chr(8969),
408 'lfloor;' => chr(8970),
409 'rfloor;' => chr(8971),
410 'lang;' => chr(9001),
411 'rang;' => chr(9002),
412 'loz;' => chr(9674),
413 'spades;' => chr(9824),
414 'clubs;' => chr(9827),
415 'hearts;' => chr(9829),
416 'diams;' => chr(9830),
417 ) : ())
418);
419
420
421# Make the opposite mapping
422while (my($entity, $char) = each(%entity2char)) {
423 $entity =~ s/;\z//;
424 $char2entity{$char} = "&$entity;";
425}
426delete $char2entity{"'"}; # only one-way decoding
427
428# Fill in missing entities
429for (0 .. 255) {
430 next if exists $char2entity{chr($_)};
431 $char2entity{chr($_)} = "&#$_;";
432}
433
434my %subst; # compiled encoding regexps
435
436sub decode_entities_old
437{
438 my $array;
439 if (defined wantarray) {
440 $array = [@_]; # copy
441 } else {
442 $array = \@_; # modify in-place
443 }
444 my $c;
445 for (@$array) {
446 s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
447 s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
448 s/(&(\w+);?)/$entity2char{$2} || $1/eg;
449 }
450 wantarray ? @$array : $array->[0];
451}
452
453sub encode_entities
454{
455 my $ref;
456 if (defined wantarray) {
457 my $x = $_[0];
458 $ref = \$x; # copy
459 } else {
460 $ref = \$_[0]; # modify in-place
461 }
462 if (defined $_[1] and length $_[1]) {
463 unless (exists $subst{$_[1]}) {
464 # Because we can't compile regex we fake it with a cached sub
465 my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
466 $subst{$_[1]} = eval $code;
467 die( $@ . " while trying to turn range: \"$_[1]\"\n "
468 . "into code: $code\n "
469 ) if $@;
470 }
471 &{$subst{$_[1]}}($$ref);
472 } else {
473 # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
474 $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
475 }
476 $$ref;
477}
478
479sub encode_entities_numeric {
480 local %char2entity;
481 return &encode_entities; # a goto &encode_entities wouldn't work
482}
483
484
485sub num_entity {
486 sprintf "&#x%X;", ord($_[0]);
487}
488
489# Set up aliases
490*encode = \&encode_entities;
491*encode_numeric = \&encode_entities_numeric;
492*encode_numerically = \&encode_entities_numeric;
493*decode = \&decode_entities;
494
4951;
Note: See TracBrowser for help on using the repository browser.