Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gsdl/branches/2.80/perllib/cpan/HTML/Entities.pm@ 14880

Last change on this file since 14880 was 14078, checked in by lh92, 17 years ago
Perl modules required for HTMLTidy
Property svn:keywords set to `Author Date Id Revision`
File size: 14.8 KB

Line
1	package HTML::Entities;
2
3	# $Id: Entities.pm 14078 2007-05-17 03:15:41Z lh92 $
4
5	=head1 NAME
6
7	HTML::Entities - Encode or decode strings with HTML entities
8
9	=head1 SYNOPSIS
10
11	use HTML::Entities;
12
13	$a = "Våre norske tegn bør &#230res";
14	decode_entities($a);
15	encode_entities($a, "\200-\377");
16
17	For example, this:
18
19	$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20	print encode_entities($input), "\n"
21
22	Prints this out:
23
24	vis-à-vis Beyoncé's naïve
25	papier-mâché résumé
26
27	=head1 DESCRIPTION
28
29	This module deals with encoding and decoding of strings with HTML
30	character entities. The module provides the following functions:
31
32	=over 4
33
34	=item decode_entities( $string, ... )
35
36	This routine replaces HTML entities found in the $string with the
37	corresponding Unicode character. Under perl 5.6 and earlier only
38	characters in the Latin-1 range are replaced. Unrecognized
39	entities are left alone.
40
41	If multiple strings are provided as argument they are each decoded
42	separately and the same number of strings are returned.
43
44	If called in void context the arguments are decoded in-place.
45
46	This routine is exported by default.
47
48	=item _decode_entities( $string, \%entity2char )
49
50	=item _decode_entities( $string, \%entity2char, $expand_prefix )
51
52	This will in-place replace HTML entities in $string. The %entity2char
53	hash must be provided. Named entities not found in the %entity2char
54	hash are left alone. Numeric entities are expanded unless their value
55	overflow.
56
57	The keys in %entity2char are the entity names to be expanded and their
58	values are what they should expand into. The values do not have to be
59	single character strings. If a key has ";" as suffix,
60	then occurrences in $string are only expanded if properly terminated
61	with ";". Entities without ";" will be expanded regardless of how
62	they are terminated for compatiblity with how common browsers treat
63	entities in the Latin-1 range.
64
65	If $expand_prefix is TRUE then entities without trailing ";" in
66	%entity2char will even be expanded as a prefix of a longer
67	unrecognized name. The longest matching name in %entity2char will be
68	used. This is mainly present for compatibility with an MSIE
69	misfeature.
70
71	$string = "foo&nbspbar";
72	_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
73	print $string; # will print "foo bar"
74
75	This routine is exported by default.
76
77	=item encode_entities( $string )
78
79	=item encode_entities( $string, $unsafe_chars )
80
81	This routine replaces unsafe characters in $string with their entity
82	representation. A second argument can be given to specify which
83	characters to consider unsafe (i.e., which to escape). The default set
84	of characters to encode are control chars, high-bit chars, and the
85	C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >>
86	characters. But this, for example, would encode I<just> the
87	C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters:
88
89	$encoded = encode_entities($input, '<>&"');
90
91	This routine is exported by default.
92
93	=item encode_entities_numeric( $string )
94
95	=item encode_entities_numeric( $string, $unsafe_chars )
96
97	This routine works just like encode_entities, except that the replacement
98	entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
99	example, C<encode_entities("r\xF4le")> returns "rôle", but
100	C<encode_entities_numeric("r\xF4le")> returns "rôle".
101
102	This routine is I<not> exported by default. But you can always
103	export it with C<use HTML::Entities qw(encode_entities_numeric);>
104	or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
105
106	=back
107
108	All these routines modify the string passed as the first argument, if
109	called in a void context. In scalar and array contexts, the encoded or
110	decoded string is returned (without changing the input string).
111
112	If you prefer not to import these routines into your namespace, you can
113	call them as:
114
115	use HTML::Entities ();
116	$decoded = HTML::Entities::decode($a);
117	$encoded = HTML::Entities::encode($a);
118	$encoded = HTML::Entities::encode_numeric($a);
119
120	The module can also export the %char2entity and the %entity2char
121	hashes, which contain the mapping from all characters to the
122	corresponding entities (and vice versa, respectively).
123
124	=head1 COPYRIGHT
125
126	Copyright 1995-2006 Gisle Aas. All rights reserved.
127
128	This library is free software; you can redistribute it and/or
129	modify it under the same terms as Perl itself.
130
131	=cut
132
133	use strict;
134	use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
135	use vars qw(%entity2char %char2entity);
136
137	require 5.004;
138	require Exporter;
139	@ISA = qw(Exporter);
140
141	@EXPORT = qw(encode_entities decode_entities _decode_entities);
142	@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
143
144	$VERSION = sprintf("%d.%02d", q$Revision: 14078 $ =~ /(\d+)\.(\d+)/);
145	sub Version { $VERSION; }
146
147	require HTML::Parser; # for fast XS implemented decode_entities
148
149
150	%entity2char = (
151	# Some normal chars that have special meaning in SGML context
152	amp => '&', # ampersand
153	'gt' => '>', # greater than
154	'lt' => '<', # less than
155	quot => '"', # double quote
156	apos => "'", # single quote
157
158	# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
159	AElig => chr(198), # capital AE diphthong (ligature)
160	Aacute => chr(193), # capital A, acute accent
161	Acirc => chr(194), # capital A, circumflex accent
162	Agrave => chr(192), # capital A, grave accent
163	Aring => chr(197), # capital A, ring
164	Atilde => chr(195), # capital A, tilde
165	Auml => chr(196), # capital A, dieresis or umlaut mark
166	Ccedil => chr(199), # capital C, cedilla
167	ETH => chr(208), # capital Eth, Icelandic
168	Eacute => chr(201), # capital E, acute accent
169	Ecirc => chr(202), # capital E, circumflex accent
170	Egrave => chr(200), # capital E, grave accent
171	Euml => chr(203), # capital E, dieresis or umlaut mark
172	Iacute => chr(205), # capital I, acute accent
173	Icirc => chr(206), # capital I, circumflex accent
174	Igrave => chr(204), # capital I, grave accent
175	Iuml => chr(207), # capital I, dieresis or umlaut mark
176	Ntilde => chr(209), # capital N, tilde
177	Oacute => chr(211), # capital O, acute accent
178	Ocirc => chr(212), # capital O, circumflex accent
179	Ograve => chr(210), # capital O, grave accent
180	Oslash => chr(216), # capital O, slash
181	Otilde => chr(213), # capital O, tilde
182	Ouml => chr(214), # capital O, dieresis or umlaut mark
183	THORN => chr(222), # capital THORN, Icelandic
184	Uacute => chr(218), # capital U, acute accent
185	Ucirc => chr(219), # capital U, circumflex accent
186	Ugrave => chr(217), # capital U, grave accent
187	Uuml => chr(220), # capital U, dieresis or umlaut mark
188	Yacute => chr(221), # capital Y, acute accent
189	aacute => chr(225), # small a, acute accent
190	acirc => chr(226), # small a, circumflex accent
191	aelig => chr(230), # small ae diphthong (ligature)
192	agrave => chr(224), # small a, grave accent
193	aring => chr(229), # small a, ring
194	atilde => chr(227), # small a, tilde
195	auml => chr(228), # small a, dieresis or umlaut mark
196	ccedil => chr(231), # small c, cedilla
197	eacute => chr(233), # small e, acute accent
198	ecirc => chr(234), # small e, circumflex accent
199	egrave => chr(232), # small e, grave accent
200	eth => chr(240), # small eth, Icelandic
201	euml => chr(235), # small e, dieresis or umlaut mark
202	iacute => chr(237), # small i, acute accent
203	icirc => chr(238), # small i, circumflex accent
204	igrave => chr(236), # small i, grave accent
205	iuml => chr(239), # small i, dieresis or umlaut mark
206	ntilde => chr(241), # small n, tilde
207	oacute => chr(243), # small o, acute accent
208	ocirc => chr(244), # small o, circumflex accent
209	ograve => chr(242), # small o, grave accent
210	oslash => chr(248), # small o, slash
211	otilde => chr(245), # small o, tilde
212	ouml => chr(246), # small o, dieresis or umlaut mark
213	szlig => chr(223), # small sharp s, German (sz ligature)
214	thorn => chr(254), # small thorn, Icelandic
215	uacute => chr(250), # small u, acute accent
216	ucirc => chr(251), # small u, circumflex accent
217	ugrave => chr(249), # small u, grave accent
218	uuml => chr(252), # small u, dieresis or umlaut mark
219	yacute => chr(253), # small y, acute accent
220	yuml => chr(255), # small y, dieresis or umlaut mark
221
222	# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
223	copy => chr(169), # copyright sign
224	reg => chr(174), # registered sign
225	nbsp => chr(160), # non breaking space
226
227	# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
228	iexcl => chr(161),
229	cent => chr(162),
230	pound => chr(163),
231	curren => chr(164),
232	yen => chr(165),
233	brvbar => chr(166),
234	sect => chr(167),
235	uml => chr(168),
236	ordf => chr(170),
237	laquo => chr(171),
238	'not' => chr(172), # not is a keyword in perl
239	shy => chr(173),
240	macr => chr(175),
241	deg => chr(176),
242	plusmn => chr(177),
243	sup1 => chr(185),
244	sup2 => chr(178),
245	sup3 => chr(179),
246	acute => chr(180),
247	micro => chr(181),
248	para => chr(182),
249	middot => chr(183),
250	cedil => chr(184),
251	ordm => chr(186),
252	raquo => chr(187),
253	frac14 => chr(188),
254	frac12 => chr(189),
255	frac34 => chr(190),
256	iquest => chr(191),
257	'times' => chr(215), # times is a keyword in perl
258	divide => chr(247),
259
260	( $] > 5.007 ? (
261	'OElig;' => chr(338),
262	'oelig;' => chr(339),
263	'Scaron;' => chr(352),
264	'scaron;' => chr(353),
265	'Yuml;' => chr(376),
266	'fnof;' => chr(402),
267	'circ;' => chr(710),
268	'tilde;' => chr(732),
269	'Alpha;' => chr(913),
270	'Beta;' => chr(914),
271	'Gamma;' => chr(915),
272	'Delta;' => chr(916),
273	'Epsilon;' => chr(917),
274	'Zeta;' => chr(918),
275	'Eta;' => chr(919),
276	'Theta;' => chr(920),
277	'Iota;' => chr(921),
278	'Kappa;' => chr(922),
279	'Lambda;' => chr(923),
280	'Mu;' => chr(924),
281	'Nu;' => chr(925),
282	'Xi;' => chr(926),
283	'Omicron;' => chr(927),
284	'Pi;' => chr(928),
285	'Rho;' => chr(929),
286	'Sigma;' => chr(931),
287	'Tau;' => chr(932),
288	'Upsilon;' => chr(933),
289	'Phi;' => chr(934),
290	'Chi;' => chr(935),
291	'Psi;' => chr(936),
292	'Omega;' => chr(937),
293	'alpha;' => chr(945),
294	'beta;' => chr(946),
295	'gamma;' => chr(947),
296	'delta;' => chr(948),
297	'epsilon;' => chr(949),
298	'zeta;' => chr(950),
299	'eta;' => chr(951),
300	'theta;' => chr(952),
301	'iota;' => chr(953),
302	'kappa;' => chr(954),
303	'lambda;' => chr(955),
304	'mu;' => chr(956),
305	'nu;' => chr(957),
306	'xi;' => chr(958),
307	'omicron;' => chr(959),
308	'pi;' => chr(960),
309	'rho;' => chr(961),
310	'sigmaf;' => chr(962),
311	'sigma;' => chr(963),
312	'tau;' => chr(964),
313	'upsilon;' => chr(965),
314	'phi;' => chr(966),
315	'chi;' => chr(967),
316	'psi;' => chr(968),
317	'omega;' => chr(969),
318	'thetasym;' => chr(977),
319	'upsih;' => chr(978),
320	'piv;' => chr(982),
321	'ensp;' => chr(8194),
322	'emsp;' => chr(8195),
323	'thinsp;' => chr(8201),
324	'zwnj;' => chr(8204),
325	'zwj;' => chr(8205),
326	'lrm;' => chr(8206),
327	'rlm;' => chr(8207),
328	'ndash;' => chr(8211),
329	'mdash;' => chr(8212),
330	'lsquo;' => chr(8216),
331	'rsquo;' => chr(8217),
332	'sbquo;' => chr(8218),
333	'ldquo;' => chr(8220),
334	'rdquo;' => chr(8221),
335	'bdquo;' => chr(8222),
336	'dagger;' => chr(8224),
337	'Dagger;' => chr(8225),
338	'bull;' => chr(8226),
339	'hellip;' => chr(8230),
340	'permil;' => chr(8240),
341	'prime;' => chr(8242),
342	'Prime;' => chr(8243),
343	'lsaquo;' => chr(8249),
344	'rsaquo;' => chr(8250),
345	'oline;' => chr(8254),
346	'frasl;' => chr(8260),
347	'euro;' => chr(8364),
348	'image;' => chr(8465),
349	'weierp;' => chr(8472),
350	'real;' => chr(8476),
351	'trade;' => chr(8482),
352	'alefsym;' => chr(8501),
353	'larr;' => chr(8592),
354	'uarr;' => chr(8593),
355	'rarr;' => chr(8594),
356	'darr;' => chr(8595),
357	'harr;' => chr(8596),
358	'crarr;' => chr(8629),
359	'lArr;' => chr(8656),
360	'uArr;' => chr(8657),
361	'rArr;' => chr(8658),
362	'dArr;' => chr(8659),
363	'hArr;' => chr(8660),
364	'forall;' => chr(8704),
365	'part;' => chr(8706),
366	'exist;' => chr(8707),
367	'empty;' => chr(8709),
368	'nabla;' => chr(8711),
369	'isin;' => chr(8712),
370	'notin;' => chr(8713),
371	'ni;' => chr(8715),
372	'prod;' => chr(8719),
373	'sum;' => chr(8721),
374	'minus;' => chr(8722),
375	'lowast;' => chr(8727),
376	'radic;' => chr(8730),
377	'prop;' => chr(8733),
378	'infin;' => chr(8734),
379	'ang;' => chr(8736),
380	'and;' => chr(8743),
381	'or;' => chr(8744),
382	'cap;' => chr(8745),
383	'cup;' => chr(8746),
384	'int;' => chr(8747),
385	'there4;' => chr(8756),
386	'sim;' => chr(8764),
387	'cong;' => chr(8773),
388	'asymp;' => chr(8776),
389	'ne;' => chr(8800),
390	'equiv;' => chr(8801),
391	'le;' => chr(8804),
392	'ge;' => chr(8805),
393	'sub;' => chr(8834),
394	'sup;' => chr(8835),
395	'nsub;' => chr(8836),
396	'sube;' => chr(8838),
397	'supe;' => chr(8839),
398	'oplus;' => chr(8853),
399	'otimes;' => chr(8855),
400	'perp;' => chr(8869),
401	'sdot;' => chr(8901),
402	'lceil;' => chr(8968),
403	'rceil;' => chr(8969),
404	'lfloor;' => chr(8970),
405	'rfloor;' => chr(8971),
406	'lang;' => chr(9001),
407	'rang;' => chr(9002),
408	'loz;' => chr(9674),
409	'spades;' => chr(9824),
410	'clubs;' => chr(9827),
411	'hearts;' => chr(9829),
412	'diams;' => chr(9830),
413	) : ())
414	);
415
416
417	# Make the opposite mapping
418	while (my($entity, $char) = each(%entity2char)) {
419	$entity =~ s/;\z//;
420	$char2entity{$char} = "&$entity;";
421	}
422	delete $char2entity{"'"}; # only one-way decoding
423
424	# Fill in missing entities
425	for (0 .. 255) {
426	next if exists $char2entity{chr($_)};
427	$char2entity{chr($_)} = "&#$_;";
428	}
429
430	my %subst; # compiled encoding regexps
431
432	sub decode_entities_old
433	{
434	my $array;
435	if (defined wantarray) {
436	$array = [@_]; # copy
437	} else {
438	$array = \@_; # modify in-place
439	}
440	my $c;
441	for (@$array) {
442	s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg;
443	s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg;
444	s/(&(\w+);?)/$entity2char{$2} \|\| $1/eg;
445	}
446	wantarray ? @$array : $array->[0];
447	}
448
449	sub encode_entities
450	{
451	my $ref;
452	if (defined wantarray) {
453	my $x = $_[0];
454	$ref = \$x; # copy
455	} else {
456	$ref = \$_[0]; # modify in-place
457	}
458	if (defined $_[1] and length $_[1]) {
459	unless (exists $subst{$_[1]}) {
460	# Because we can't compile regex we fake it with a cached sub
461	my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} \|\| num_entity(\$1)/ge; }";
462	$subst{$_[1]} = eval $code;
463	die( $@ . " while trying to turn range: \"$_[1]\"\n "
464	. "into code: $code\n "
465	) if $@;
466	}
467	&{$subst{$_[1]}}($$ref);
468	} else {
469	# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
470	$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} \|\| num_entity($1)/ge;
471	}
472	$$ref;
473	}
474
475	sub encode_entities_numeric {
476	local %char2entity;
477	return &encode_entities; # a goto &encode_entities wouldn't work
478	}
479
480
481	sub num_entity {
482	sprintf "&#x%X;", ord($_[0]);
483	}
484
485	# Set up aliases
486	*encode = \&encode_entities;
487	*encode_numeric = \&encode_entities_numeric;
488	*encode_numerically = \&encode_entities_numeric;
489	*decode = \&decode_entities;
490
491	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: