Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

ghtml.pm

Last change on this file was 35167, checked in by kjdon, 3 years ago
getcharequiv, which replaces entities with utf8 or characters, can now be told to keep surrogate entities, which can then be processed by desurrogate, which replaces a surrogate pair with the character
Property svn:keywords set to `Author Date Id Revision`
File size: 17.2 KB

Line
1	###########################################################################
2	#
3	# ghtml.pm -- this used to be called html.pm but it clashed
4	# with the existing html module under windows
5	#
6	# A component of the Greenstone digital library software
7	# from the New Zealand Digital Library Project at the
8	# University of Waikato, New Zealand.
9	#
10	# Copyright (C) 1999 New Zealand Digital Library Project
11	#
12	# This program is free software; you can redistribute it and/or modify
13	# it under the terms of the GNU General Public License as published by
14	# the Free Software Foundation; either version 2 of the License, or
15	# (at your option) any later version.
16	#
17	# This program is distributed in the hope that it will be useful,
18	# but WITHOUT ANY WARRANTY; without even the implied warranty of
19	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20	# GNU General Public License for more details.
21	#
22	# You should have received a copy of the GNU General Public License
23	# along with this program; if not, write to the Free Software
24	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25	#
26	###########################################################################
27
28	package ghtml;
29
30	use strict;
31	use unicode;
32
33	# htmlsafe(TEXT)
34	#
35	# Converts SGML meta characters in TEXT to entity references.
36	#
37	sub htmlsafe
38	{
39	$_[0] =~ s/&/&/osg;
40	$_[0] =~ s/</</osg;
41	$_[0] =~ s/>/>/osg;
42	$_[0] =~ s/\"/"/osg;
43	}
44
45	# unescape_html(TEXT)
46	#
47	# Converts HTML entities into their original form.
48	#
49	sub unescape_html
50	{
51	my ($html) = @_;
52
53	$html =~ s/&/&/osg;
54	$html =~ s/</</osg;
55	$html =~ s/>/>/osg;
56	$html =~ s/"/\"/osg;
57
58	return $html;
59	}
60
61	# urlsafe(TEXT)
62	#
63	# Converts characters not allowed in a URL to their hex representation.
64	#
65	sub urlsafe
66	{
67	# protect any hash's that are part of an entity, e.g. a
68	$_[0] =~ s/&#(.*?);/&%23$1;/g;
69
70	# and the usual suspects
71	$_[0] =~ s/[\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/]/sprintf("%%%2x", ord($&))/gse;
72	}
73
74
75	# named entry to the standard html font
76	my %charnetosf = ("Agrave"=> "192", "Aacute"=> "193", "Acirc" => "194", "Atilde"=> "195",
77	"Auml" => "196", "Aring" => "197", "AElig" => "198", "Ccedil"=> "199",
78	"Egrave"=> "200", "Eacute"=> "201", "Ecirc" => "202", "Euml" => "203",
79	"Igrave"=> "204", "Iacute"=> "205", "Icirc" => "206", "Iuml" => "207",
80	"ETH" => "208", "Ntilde"=> "209", "Ograve"=> "210", "Oacute"=> "211",
81	"Ocirc" => "212", "Otilde"=> "213", "Ouml" => "214",
82	"Oslash"=> "216", "Ugrave"=> "217", "Uacute"=> "218", "Ucirc" => "219",
83	"Uuml" => "220", "Yacute"=> "221", "THORN" => "222", "szlig" => "223",
84	"agrave"=> "224", "aacute"=> "225", "acirc" => "226", "atilde"=> "227",
85	"auml" => "228", "aring" => "229", "aelig" => "230", "ccedil"=> "231",
86	"egrave"=> "232", "eacute"=> "233", "ecirc" => "234", "euml" => "235",
87	"igrave"=> "236", "iacute"=> "237", "icirc" => "238", "iuml" => "239",
88	"eth" => "240", "ntilde"=> "241", "ograve"=> "242", "oacute"=> "243",
89	"ocirc" => "244", "otilde"=> "245", "ouml" => "246",
90	"oslash"=> "248", "ugrave"=> "249", "uacute"=> "250", "ucirc" => "251",
91	"uuml" => "252", "yacute"=> "253", "thorn" => "254", "yuml" => "255");
92
93	my %symnetosf = ("quot" => "34", "amp" => "38", "lt" => "60", "gt" => "62",
94	"nbsp" => "160", "iexcl" => "161", "cent" => "162", "pound" => "163",
95	"curren"=> "164", "yen" => "165", "brvbar"=> "166", "sect" => "167",
96	"uml" => "168", "copy" => "169", "ordf" => "170", "laquo" => "171",
97	"not" => "172", "shy" => "173", "reg" => "174", "macr" => "175",
98	"deg" => "176", "plusmn"=> "177", "sup2" => "178", "sup3" => "179",
99	"acute" => "180", "micro" => "181", "para" => "182", "middot"=> "183",
100	"cedil" => "184", "sup1" => "185", "ordm" => "186", "raquo" => "187",
101	"frac14"=> "188", "frac12"=> "189", "frac34"=> "190", "iquest"=> "191",
102	"times" => "215", "divide"=> "247");
103
104
105
106	# standard font to plain text
107	my %sftotxt = ("32" => " ", "33" => "!", "34" => "\"", "35" => "\#", "36" => "\$",
108	"37" => "\%", "38" => "&", "39" => "'", "40" => "(", "41" => ")",
109	"42" => "*", "43" => "+", "44" => ",", "45" => "-", "46" => ".",
110	"47" => "/", "48" => "0", "49" => "1", "50" => "2", "51" => "3",
111	"52" => "4", "53" => "5", "54" => "6", "55" => "7", "56" => "8",
112	"57" => "9", "58" => ":", "59" => ";", "60" => "<", "61" => "=",
113	"62" => ">", "63" => "?", "64" => "\@", "65" => "A", "66" => "B",
114	"57" => "9", "58" => ":", "59" => ";", "61" => "=",
115	"63" => "?", "64" => "\@", "65" => "A", "66" => "B",
116	"67" => "C", "68" => "D", "69" => "E", "70" => "F", "71" => "G",
117	"72" => "H", "73" => "I", "74" => "J", "75" => "K", "76" => "L",
118	"77" => "M", "78" => "N", "79" => "O", "80" => "P", "81" => "Q",
119	"82" => "R", "83" => "S", "84" => "T", "85" => "U", "86" => "V",
120	"87" => "W", "88" => "X", "89" => "Y", "90" => "Z", "91" => "[",
121	"92" => "\\", "93" => "]", "94" => "^", "95" => "_", "96" => "`",
122	"97" => "a", "98" => "b", "99" => "c", "100" => "d", "101" => "e",
123	"102" => "f", "103" => "g", "104" => "h", "105" => "i", "106" => "j",
124	"107" => "k", "108" => "l", "109" => "m", "110" => "n", "111" => "o",
125	"112" => "p", "113" => "q", "114" => "r", "115" => "s", "116" => "t",
126	"117" => "u", "118" => "v", "119" => "w", "120" => "x", "121" => "y",
127	"122" => "z", "123" => "{", "124" => "\|", "125" => "}", "126" => "~",
128	"130" => ",", "131" => "f", "132" => "\"", "133" => "...", "139" => "<",
129	"140" => "OE", "145" => "'", "146" => "'", "147" => "\"", "148" => "\"",
130	"149" => "o", "150" => "--", "151" => "-", "152" => "~", "153" => "TM",
131	"155" => ">", "156" => "oe", "159" => "Y", "160" => " ", "178" => "2",
132	"179" => "3", "185" => "1", "188" => "1/4", "189" => "1/2", "190" => "3/4",
133	"192" => "A", "193" => "A", "194" => "A", "195" => "A", "196" => "A",
134	"197" => "A", "198" => "AE", "199" => "C", "200" => "E", "201" => "E",
135	"202" => "E", "203" => "E", "204" => "I", "205" => "I", "206" => "I",
136	"207" => "I", "208" => "D", "209" => "N", "210" => "O", "211" => "O",
137	"212" => "O", "213" => "O", "214" => "O", "215" => "*", "216" => "O",
138	"217" => "U", "218" => "U", "219" => "U", "220" => "U", "221" => "Y",
139	"223" => "ss", "224" => "a", "225" => "a", "226" => "a", "227" => "a",
140	"228" => "a", "229" => "a", "230" => "ae", "231" => "c", "232" => "e",
141	"233" => "e", "234" => "e", "235" => "e", "236" => "i", "237" => "i",
142	"238" => "i", "239" => "i", "241" => "n", "242" => "o", "243" => "o",
143	"244" => "o", "245" => "o", "246" => "o", "247" => "/", "248" => "o",
144	"249" => "u", "250" => "u", "251" => "u", "252" => "u", "253" => "y",
145	"255" => "y", "8218" => ",");
146
147
148	my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",
149	"aifc"=>"audio/x-aiff", "aiff"=>"audio/x-aiff",
150	"au"=>"audio/basic", "avi"=>"video/x-msvideo",
151	"bcpio"=>"application/x-bcpio", "bin"=>"application/octet-stream",
152	"cdf"=>"application/x-netcdf", "class"=>"application/octet-stream",
153	"cpio"=>"application/x-cpio", "cpt"=>"application/mac-compactpro",
154	"csh"=>"application/x-csh", "dcr"=>"application/x-director",
155	"dir"=>"application/x-director", "dms"=>"application/octet-stream",
156	"doc"=>"application/msword", "dvi"=>"application/x-dvi",
157	"dxr"=>"application/x-director", "eps"=>"application/postscript",
158	"etx"=>"text/x-setext",
159	"exe"=>"application/octet-stream", "gif"=>"image/gif",
160	"gtar"=>"application/x-gtar", "hdf"=>"application/x-hdf",
161	"hqx"=>"application/mac-binhex40", "htm"=>"text/html",
162	"html"=>"text/html", "ice"=>"x-conference/x-cooltalk",
163	"ief"=>"image/ief", "jpe"=>"image/jpeg",
164	"jpeg"=>"image/jpeg", "jpg"=>"image/jpeg",
165	"kar"=>"audio/midi", "latex"=>"application/x-latex",
166	"lha"=>"application/octet-stream", "lzh"=>"application/octet-stream",
167	"man"=>"application/x-troff-man", "mcf"=>"image/vasa",
168	"me"=>"application/x-troff-me", "mid"=>"audio/midi",
169	"midi"=>"audio/midi", "mif"=>"application/x-mif",
170	"mov"=>"video/quicktime", "movie"=>"video/x-sgi-movie",
171	"mp2"=>"audio/mpeg", "mpe"=>"video/mpeg",
172	"mpeg"=>"video/mpeg", "mpg"=>"video/mpeg",
173	"mpga"=>"audio/mpeg", "ms"=>"application/x-troff-ms",
174	"nc"=>"application/x-netcdf", "oda"=>"application/oda",
175	"pbm"=>"image/x-portable-bitmap", "pdb"=>"chemical/x-pdb",
176	"pdf"=>"application/pdf", "pgm"=>"image/x-portable-graymap",
177	"png"=>"image/png", "pnm"=>"image/x-portable-anymap",
178	"ppm"=>"image/x-portable-pixmap",
179	"ppt"=>"application/vnd.ms-powerpoint",
180	"ps"=>"application/postscript", "qt"=>"video/quicktime",
181	"ra"=>"audio/x-realaudio", "ram"=>"audio/x-pn-realaudio",
182	"ras"=>"image/x-cmu-raster", "rgb"=>"image/x-rgb",
183	"roff"=>"application/x-troff", "rpm"=>"audio/x-pn-realaudio-plugin",
184	"rtf"=>"application/rtf", "rtx"=>"text/richtext",
185	"sgm"=>"text/x-sgml", "sgml"=>"text/x-sgml",
186	"sh"=>"application/x-sh", "shar"=>"application/x-shar",
187	"sit"=>"application/x-stuffit", "skd"=>"application/x-koan",
188	"skm"=>"application/x-koan", "skp"=>"application/x-koan",
189	"skt"=>"application/x-koan", "snd"=>"audio/basic",
190	"src"=>"application/x-wais-source", "sv4cpio"=>"application/x-sv4cpio",
191	"sv4crc"=>"application/x-sv4crc", "t"=>"application/x-troff",
192	"tar"=>"application/x-tar", "tcl"=>"application/x-tcl",
193	"tex"=>"application/x-tex", "texi"=>"application/x-texinfo",
194	"texinfo"=>"application/x-texinfo", "tif"=>"image/tiff",
195	"tiff"=>"image/tiff", "tr"=>"application/x-troff",
196	"tsv"=>"text/tab-separated-values", "txt"=>"text/plain",
197	"ustar"=>"application/x-ustar", "vcd"=>"application/x-cdlink",
198	"vrml"=>"x-world/x-vrml", "wav"=>"audio/x-wav",
199	"wrl"=>"x-world/x-vrml", "xbm"=>"image/x-xbitmap",
200	"xls"=>"application/vnd.ms-excel",
201	"xpm"=>"image/x-xpixmap", "xwd"=>"image/x-xwindowdump",
202	"xyz"=>"chemical/x-pdb", "zip"=>"application/zip");
203
204
205	# This gets passed in 2 entities, with &# and ; stripped off. If they are a valid surrogate pair,
206	# it returns the character they represent
207	sub desurrogate {
208	my ($hi, $lo) = @_;
209
210	my $hi_code = undef;
211	my $lo_code = undef;
212
213	if ($hi =~ m/^0*(\d+)$/) {
214	$hi_code=$1;
215	}
216	elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
217	$hi_code=hex($1);
218	}
219	if ($lo =~ m/^0*(\d+)$/) {
220	$lo_code=$1;
221	}
222	elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
223	$lo_code=hex($1);
224	}
225	if (!defined $hi_code && !defined $lo_code) {
226	# wasn't proper surrogate pair
227	print STDERR "WARNING, &#$hi; &#$lo; is not a valid surrogate pair, returning '?'\n";
228	return "?";
229
230	}
231	#([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])
232	if($hi_code >= 0xD800 && $hi_code <= 0xDBFF && $lo_code >= 0xDC00 && $lo_code <= 0xDFFF) {
233	#print STDERR "Found surrogate pair $hi_code, $lo_code\n";
234	my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
235	my $char_equiv = &unicode::unicode2utf8([$codepoint]);
236	$char_equiv = Encode::decode("utf8",$char_equiv);
237	return $char_equiv;
238	} else {
239	print STDERR "WARNING, &#$hi_code; &#$lo_code; is not a valid surrogate pair, returning '?'\n";
240	return "?";
241	}
242	}
243
244	#If you want to remove surrogate pairs before you process all the other entities, then you need more complicated lookahead system, to handle when the two entities you are looking at are not the pair.
245	# leaving this here for future reference, but its not used currently
246	# (?= is lookahead, can return capturing groups, but won't be consumed by a match
247	# (?:....)? non-capturing group that is optional
248	#$$textref =~ s/&\#([^;]+);(?=(?:&\#([^;]+);)?)/&ghtml::desurrogate($1,$2,1)/gseo;
249
250	# returns a surroage pair. assumes &# and ; have been stripped off the entity
251	# optional lookahead to get $lo
252	my $in_surrogate = 0;
253	sub preprocess_desurrogate_NOTUSED {
254	my ($hi, $lo, $and_decode) = @_;
255	print STDERR "in ghtml::desurrogate, $hi"; if (defined $lo) {print STDERR " $lo";} print STDERR "\n";
256	my $hi_code = undef;
257	my $lo_code = undef;
258
259	if ($in_surrogate) { # consume the second entity of the surrogate
260	$in_surrogate = 0;
261	return "";
262	}
263	if (!defined $lo) { # we are not part of a pair
264	return "&#$hi;";
265	}
266	if ($hi =~ m/^0*(\d+)$/) {
267	$hi_code=$1;
268	}
269	elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
270	$hi_code=hex($1);
271	}
272	# are we the first part of a surrogate?
273	if (!defined $hi_code \|\| !($hi_code >= 0xD800 && $hi_code <= 0xDFFF)) {
274	# no, return the original
275	return "&#$hi;";
276	}
277	# check the second part - is that a surrogate part?
278	if ($lo =~ m/^0*(\d+)$/) {
279	$lo_code=$1;
280	}
281	elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
282	$lo_code=hex($1);
283	}
284
285	if (!defined $lo_code \|\| !($lo_code>= 0xD800 && $lo_code <= 0xDFFF)) {
286	# not part of a surrogate
287	return "&#$hi;";
288	}
289
290	my $char_equiv = undef;
291	my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
292	$char_equiv = &unicode::unicode2utf8([$codepoint]);
293
294
295	if (!defined $char_equiv) {
296	return "&#$hi;";
297	}
298	else {
299	if ((defined $and_decode) && ($and_decode)) {
300	$char_equiv = Encode::decode("utf8",$char_equiv);
301	}
302	$in_surrogate=1;
303	print STDERR "found surrogate\n";
304	return $char_equiv;
305	}
306	}
307	# returns the character as a raw utf-8 character. It assumes that the
308	# & and ; have been stripped off the string.
309	# If and_decode is true, it returns the codepoint instead of utf8
310	# If keep_surrogates is true, leave the surrogate entities as is - for later processing with desurrogate.
311	sub getcharequiv {
312	my ($entity, $convertsymbols, $and_decode, $keep_surrogates) = @_;
313
314	$keep_surrogates = 0 unless defined $keep_surrogates;
315	my $char_equiv = undef;
316
317	# a numeric entity
318	my $code = undef;
319	if ($entity =~ m/^\#0*(\d+)$/) {
320	$code=$1;
321	}
322	elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) {
323	$code=hex($1);
324	}
325
326	if (defined $code) {
327
328	# UTF-16 surrogate pairs
329	if($code >= 0xD800 && $code <= 0xDFFF) {
330	print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). ";
331	if ($keep_surrogates) {
332	print STDERR "Leaving as entity\n";
333	return "&$entity;";
334	}
335	else {
336	print STDERR "Replacing with '?'.\n";
337	$code = ord("?");
338	}
339	}
340
341	# non-standard Microsoft breakage, as usual
342	if ($code < 0x9f) { # code page 1252 uses reserved bytes
343	if ($code == 0x91) {$code=0x2018} # 145 = single left quote
344	elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote
345	elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote
346	elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote
347	# ...
348	}
349	$char_equiv = &unicode::unicode2utf8([$code]);
350	}
351
352	# a named character entity
353	elsif (defined $charnetosf{$entity}) {
354	$char_equiv = &unicode::unicode2utf8([$charnetosf{$entity}]);
355	}
356
357	# a named symbol entity
358	elsif ($convertsymbols && defined $symnetosf{$entity}) {
359	$char_equiv = &unicode::unicode2utf8([$symnetosf{$entity}]);
360	}
361
362	if (!defined $char_equiv) {
363	return "&$entity;"; # unknown character
364	}
365	else {
366	if ((defined $and_decode) && ($and_decode)) {
367	$char_equiv = Encode::decode("utf8",$char_equiv);
368	}
369	return $char_equiv;
370	}
371	}
372
373	# convert character entities from named equivalents to html font
374	sub convertcharentities {
375	# args: the text that you want to convert
376
377	$_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse;
378	}
379
380	# convert any entities from named equivalents to html font
381	sub convertallentities {
382	# args: the text that you want to convert
383
384	$_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse;
385	}
386
387	sub html2txt {
388	# args: the text that you want converted to ascii,
389	# and whether to strip out sgml tags
390
391	# strip out sgml tags if needed
392	$_[0] =~ s/<[^>]*>//g if $_[1];
393
394	# convert the char entities to the standard html font
395	&convertcharentities($_[0]);
396
397	# convert the html character set to a plain ascii character set
398	my $pos = 0;
399	while ($pos < length($_[0])) {
400	my $charnum = ord(substr($_[0], $pos, 1));
401	if ($charnum >= 32) { # only convert characters above #32
402	my $replacechars = " ";
403	$replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum};
404	substr($_[0], $pos, 1) = $replacechars;
405	$pos += length ($replacechars);
406
407	} else {
408	$pos ++;
409	}
410	}
411	}
412
413
414	# look for mime.types (eg in /etc, or apache/conf directories), or have a look
415	# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
416	sub guess_mime_type {
417	my ($filename) = @_;
418	# make the filename lowercase, since the mimetypes hashmap looks for lowercase
419	$filename = lc($filename);
420
421	my ($fileext) = $filename =~ /\.(\w+)$/;
422	return "unknown" unless defined $fileext;
423
424	# else
425	my $mimetype = $mime_type{$fileext};
426	return $mimetype if (defined $mimetype);
427
428	return "unknown";
429	}
430
431
432	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/ghtml.pm

Download in other formats: