source: main/trunk/greenstone2/perllib/ghtml.pm

Last change on this file was 35167, checked in by kjdon, 3 years ago

getcharequiv, which replaces entities with utf8 or characters, can now be told to keep surrogate entities, which can then be processed by desurrogate, which replaces a surrogate pair with the character

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1###########################################################################
2#
3# ghtml.pm -- this used to be called html.pm but it clashed
4# with the existing html module under windows
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package ghtml;
29
30use strict;
31use unicode;
32
33# htmlsafe(TEXT)
34#
35# Converts SGML meta characters in TEXT to entity references.
36#
37sub htmlsafe
38{
39 $_[0] =~ s/&/&/osg;
40 $_[0] =~ s/</&lt;/osg;
41 $_[0] =~ s/>/&gt;/osg;
42 $_[0] =~ s/\"/&quot;/osg;
43}
44
45# unescape_html(TEXT)
46#
47# Converts HTML entities into their original form.
48#
49sub unescape_html
50{
51 my ($html) = @_;
52
53 $html =~ s/&amp;/&/osg;
54 $html =~ s/&lt;/</osg;
55 $html =~ s/&gt;/>/osg;
56 $html =~ s/&quot;/\"/osg;
57
58 return $html;
59}
60
61# urlsafe(TEXT)
62#
63# Converts characters not allowed in a URL to their hex representation.
64#
65sub urlsafe
66{
67 # protect any hash's that are part of an entity, e.g. &#097;
68 $_[0] =~ s/&#(.*?);/&%23$1;/g;
69
70 # and the usual suspects
71 $_[0] =~ s/[\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/]/sprintf("%%%2x", ord($&))/gse;
72}
73
74
75# named entry to the standard html font
76my %charnetosf = ("Agrave"=> "192", "Aacute"=> "193", "Acirc" => "194", "Atilde"=> "195",
77 "Auml" => "196", "Aring" => "197", "AElig" => "198", "Ccedil"=> "199",
78 "Egrave"=> "200", "Eacute"=> "201", "Ecirc" => "202", "Euml" => "203",
79 "Igrave"=> "204", "Iacute"=> "205", "Icirc" => "206", "Iuml" => "207",
80 "ETH" => "208", "Ntilde"=> "209", "Ograve"=> "210", "Oacute"=> "211",
81 "Ocirc" => "212", "Otilde"=> "213", "Ouml" => "214",
82 "Oslash"=> "216", "Ugrave"=> "217", "Uacute"=> "218", "Ucirc" => "219",
83 "Uuml" => "220", "Yacute"=> "221", "THORN" => "222", "szlig" => "223",
84 "agrave"=> "224", "aacute"=> "225", "acirc" => "226", "atilde"=> "227",
85 "auml" => "228", "aring" => "229", "aelig" => "230", "ccedil"=> "231",
86 "egrave"=> "232", "eacute"=> "233", "ecirc" => "234", "euml" => "235",
87 "igrave"=> "236", "iacute"=> "237", "icirc" => "238", "iuml" => "239",
88 "eth" => "240", "ntilde"=> "241", "ograve"=> "242", "oacute"=> "243",
89 "ocirc" => "244", "otilde"=> "245", "ouml" => "246",
90 "oslash"=> "248", "ugrave"=> "249", "uacute"=> "250", "ucirc" => "251",
91 "uuml" => "252", "yacute"=> "253", "thorn" => "254", "yuml" => "255");
92
93my %symnetosf = ("quot" => "34", "amp" => "38", "lt" => "60", "gt" => "62",
94 "nbsp" => "160", "iexcl" => "161", "cent" => "162", "pound" => "163",
95 "curren"=> "164", "yen" => "165", "brvbar"=> "166", "sect" => "167",
96 "uml" => "168", "copy" => "169", "ordf" => "170", "laquo" => "171",
97 "not" => "172", "shy" => "173", "reg" => "174", "macr" => "175",
98 "deg" => "176", "plusmn"=> "177", "sup2" => "178", "sup3" => "179",
99 "acute" => "180", "micro" => "181", "para" => "182", "middot"=> "183",
100 "cedil" => "184", "sup1" => "185", "ordm" => "186", "raquo" => "187",
101 "frac14"=> "188", "frac12"=> "189", "frac34"=> "190", "iquest"=> "191",
102 "times" => "215", "divide"=> "247");
103
104
105
106# standard font to plain text
107my %sftotxt = ("32" => " ", "33" => "!", "34" => "\"", "35" => "\#", "36" => "\$",
108 "37" => "\%", "38" => "&", "39" => "'", "40" => "(", "41" => ")",
109 "42" => "*", "43" => "+", "44" => ",", "45" => "-", "46" => ".",
110 "47" => "/", "48" => "0", "49" => "1", "50" => "2", "51" => "3",
111 "52" => "4", "53" => "5", "54" => "6", "55" => "7", "56" => "8",
112 "57" => "9", "58" => ":", "59" => ";", "60" => "<", "61" => "=",
113 "62" => ">", "63" => "?", "64" => "\@", "65" => "A", "66" => "B",
114 "57" => "9", "58" => ":", "59" => ";", "61" => "=",
115 "63" => "?", "64" => "\@", "65" => "A", "66" => "B",
116 "67" => "C", "68" => "D", "69" => "E", "70" => "F", "71" => "G",
117 "72" => "H", "73" => "I", "74" => "J", "75" => "K", "76" => "L",
118 "77" => "M", "78" => "N", "79" => "O", "80" => "P", "81" => "Q",
119 "82" => "R", "83" => "S", "84" => "T", "85" => "U", "86" => "V",
120 "87" => "W", "88" => "X", "89" => "Y", "90" => "Z", "91" => "[",
121 "92" => "\\", "93" => "]", "94" => "^", "95" => "_", "96" => "`",
122 "97" => "a", "98" => "b", "99" => "c", "100" => "d", "101" => "e",
123 "102" => "f", "103" => "g", "104" => "h", "105" => "i", "106" => "j",
124 "107" => "k", "108" => "l", "109" => "m", "110" => "n", "111" => "o",
125 "112" => "p", "113" => "q", "114" => "r", "115" => "s", "116" => "t",
126 "117" => "u", "118" => "v", "119" => "w", "120" => "x", "121" => "y",
127 "122" => "z", "123" => "{", "124" => "|", "125" => "}", "126" => "~",
128 "130" => ",", "131" => "f", "132" => "\"", "133" => "...", "139" => "<",
129 "140" => "OE", "145" => "'", "146" => "'", "147" => "\"", "148" => "\"",
130 "149" => "o", "150" => "--", "151" => "-", "152" => "~", "153" => "TM",
131 "155" => ">", "156" => "oe", "159" => "Y", "160" => " ", "178" => "2",
132 "179" => "3", "185" => "1", "188" => "1/4", "189" => "1/2", "190" => "3/4",
133 "192" => "A", "193" => "A", "194" => "A", "195" => "A", "196" => "A",
134 "197" => "A", "198" => "AE", "199" => "C", "200" => "E", "201" => "E",
135 "202" => "E", "203" => "E", "204" => "I", "205" => "I", "206" => "I",
136 "207" => "I", "208" => "D", "209" => "N", "210" => "O", "211" => "O",
137 "212" => "O", "213" => "O", "214" => "O", "215" => "*", "216" => "O",
138 "217" => "U", "218" => "U", "219" => "U", "220" => "U", "221" => "Y",
139 "223" => "ss", "224" => "a", "225" => "a", "226" => "a", "227" => "a",
140 "228" => "a", "229" => "a", "230" => "ae", "231" => "c", "232" => "e",
141 "233" => "e", "234" => "e", "235" => "e", "236" => "i", "237" => "i",
142 "238" => "i", "239" => "i", "241" => "n", "242" => "o", "243" => "o",
143 "244" => "o", "245" => "o", "246" => "o", "247" => "/", "248" => "o",
144 "249" => "u", "250" => "u", "251" => "u", "252" => "u", "253" => "y",
145 "255" => "y", "8218" => ",");
146
147
148my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",
149 "aifc"=>"audio/x-aiff", "aiff"=>"audio/x-aiff",
150 "au"=>"audio/basic", "avi"=>"video/x-msvideo",
151 "bcpio"=>"application/x-bcpio", "bin"=>"application/octet-stream",
152 "cdf"=>"application/x-netcdf", "class"=>"application/octet-stream",
153 "cpio"=>"application/x-cpio", "cpt"=>"application/mac-compactpro",
154 "csh"=>"application/x-csh", "dcr"=>"application/x-director",
155 "dir"=>"application/x-director", "dms"=>"application/octet-stream",
156 "doc"=>"application/msword", "dvi"=>"application/x-dvi",
157 "dxr"=>"application/x-director", "eps"=>"application/postscript",
158 "etx"=>"text/x-setext",
159 "exe"=>"application/octet-stream", "gif"=>"image/gif",
160 "gtar"=>"application/x-gtar", "hdf"=>"application/x-hdf",
161 "hqx"=>"application/mac-binhex40", "htm"=>"text/html",
162 "html"=>"text/html", "ice"=>"x-conference/x-cooltalk",
163 "ief"=>"image/ief", "jpe"=>"image/jpeg",
164 "jpeg"=>"image/jpeg", "jpg"=>"image/jpeg",
165 "kar"=>"audio/midi", "latex"=>"application/x-latex",
166 "lha"=>"application/octet-stream", "lzh"=>"application/octet-stream",
167 "man"=>"application/x-troff-man", "mcf"=>"image/vasa",
168 "me"=>"application/x-troff-me", "mid"=>"audio/midi",
169 "midi"=>"audio/midi", "mif"=>"application/x-mif",
170 "mov"=>"video/quicktime", "movie"=>"video/x-sgi-movie",
171 "mp2"=>"audio/mpeg", "mpe"=>"video/mpeg",
172 "mpeg"=>"video/mpeg", "mpg"=>"video/mpeg",
173 "mpga"=>"audio/mpeg", "ms"=>"application/x-troff-ms",
174 "nc"=>"application/x-netcdf", "oda"=>"application/oda",
175 "pbm"=>"image/x-portable-bitmap", "pdb"=>"chemical/x-pdb",
176 "pdf"=>"application/pdf", "pgm"=>"image/x-portable-graymap",
177 "png"=>"image/png", "pnm"=>"image/x-portable-anymap",
178 "ppm"=>"image/x-portable-pixmap",
179 "ppt"=>"application/vnd.ms-powerpoint",
180 "ps"=>"application/postscript", "qt"=>"video/quicktime",
181 "ra"=>"audio/x-realaudio", "ram"=>"audio/x-pn-realaudio",
182 "ras"=>"image/x-cmu-raster", "rgb"=>"image/x-rgb",
183 "roff"=>"application/x-troff", "rpm"=>"audio/x-pn-realaudio-plugin",
184 "rtf"=>"application/rtf", "rtx"=>"text/richtext",
185 "sgm"=>"text/x-sgml", "sgml"=>"text/x-sgml",
186 "sh"=>"application/x-sh", "shar"=>"application/x-shar",
187 "sit"=>"application/x-stuffit", "skd"=>"application/x-koan",
188 "skm"=>"application/x-koan", "skp"=>"application/x-koan",
189 "skt"=>"application/x-koan", "snd"=>"audio/basic",
190 "src"=>"application/x-wais-source", "sv4cpio"=>"application/x-sv4cpio",
191 "sv4crc"=>"application/x-sv4crc", "t"=>"application/x-troff",
192 "tar"=>"application/x-tar", "tcl"=>"application/x-tcl",
193 "tex"=>"application/x-tex", "texi"=>"application/x-texinfo",
194 "texinfo"=>"application/x-texinfo", "tif"=>"image/tiff",
195 "tiff"=>"image/tiff", "tr"=>"application/x-troff",
196 "tsv"=>"text/tab-separated-values", "txt"=>"text/plain",
197 "ustar"=>"application/x-ustar", "vcd"=>"application/x-cdlink",
198 "vrml"=>"x-world/x-vrml", "wav"=>"audio/x-wav",
199 "wrl"=>"x-world/x-vrml", "xbm"=>"image/x-xbitmap",
200 "xls"=>"application/vnd.ms-excel",
201 "xpm"=>"image/x-xpixmap", "xwd"=>"image/x-xwindowdump",
202 "xyz"=>"chemical/x-pdb", "zip"=>"application/zip");
203
204
205# This gets passed in 2 entities, with &# and ; stripped off. If they are a valid surrogate pair,
206# it returns the character they represent
207sub desurrogate {
208 my ($hi, $lo) = @_;
209
210 my $hi_code = undef;
211 my $lo_code = undef;
212
213 if ($hi =~ m/^0*(\d+)$/) {
214 $hi_code=$1;
215 }
216 elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
217 $hi_code=hex($1);
218 }
219 if ($lo =~ m/^0*(\d+)$/) {
220 $lo_code=$1;
221 }
222 elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
223 $lo_code=hex($1);
224 }
225 if (!defined $hi_code && !defined $lo_code) {
226 # wasn't proper surrogate pair
227 print STDERR "WARNING, &#$hi; &#$lo; is not a valid surrogate pair, returning '?'\n";
228 return "?";
229
230 }
231 #([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])
232 if($hi_code >= 0xD800 && $hi_code <= 0xDBFF && $lo_code >= 0xDC00 && $lo_code <= 0xDFFF) {
233 #print STDERR "Found surrogate pair $hi_code, $lo_code\n";
234 my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
235 my $char_equiv = &unicode::unicode2utf8([$codepoint]);
236 $char_equiv = Encode::decode("utf8",$char_equiv);
237 return $char_equiv;
238 } else {
239 print STDERR "WARNING, &#$hi_code; &#$lo_code; is not a valid surrogate pair, returning '?'\n";
240 return "?";
241 }
242}
243
244#If you want to remove surrogate pairs before you process all the other entities, then you need more complicated lookahead system, to handle when the two entities you are looking at are not the pair.
245# leaving this here for future reference, but its not used currently
246# (?= is lookahead, can return capturing groups, but won't be consumed by a match
247# (?:....)? non-capturing group that is optional
248#$$textref =~ s/&\#([^;]+);(?=(?:&\#([^;]+);)?)/&ghtml::desurrogate($1,$2,1)/gseo;
249
250# returns a surroage pair. assumes &# and ; have been stripped off the entity
251# optional lookahead to get $lo
252my $in_surrogate = 0;
253sub preprocess_desurrogate_NOTUSED {
254 my ($hi, $lo, $and_decode) = @_;
255 print STDERR "in ghtml::desurrogate, $hi"; if (defined $lo) {print STDERR " $lo";} print STDERR "\n";
256 my $hi_code = undef;
257 my $lo_code = undef;
258
259 if ($in_surrogate) { # consume the second entity of the surrogate
260 $in_surrogate = 0;
261 return "";
262 }
263 if (!defined $lo) { # we are not part of a pair
264 return "&#$hi;";
265 }
266 if ($hi =~ m/^0*(\d+)$/) {
267 $hi_code=$1;
268 }
269 elsif ($hi =~ m/^x([0-9A-F]+)$/i) {
270 $hi_code=hex($1);
271 }
272 # are we the first part of a surrogate?
273 if (!defined $hi_code || !($hi_code >= 0xD800 && $hi_code <= 0xDFFF)) {
274 # no, return the original
275 return "&#$hi;";
276 }
277 # check the second part - is that a surrogate part?
278 if ($lo =~ m/^0*(\d+)$/) {
279 $lo_code=$1;
280 }
281 elsif ($lo =~ m/^x([0-9A-F]+)$/i) {
282 $lo_code=hex($1);
283 }
284
285 if (!defined $lo_code || !($lo_code>= 0xD800 && $lo_code <= 0xDFFF)) {
286 # not part of a surrogate
287 return "&#$hi;";
288 }
289
290 my $char_equiv = undef;
291 my $codepoint = 0x10000 + ($hi_code - 0xD800) * 0x400 + ($lo_code - 0xDC00);
292 $char_equiv = &unicode::unicode2utf8([$codepoint]);
293
294
295 if (!defined $char_equiv) {
296 return "&#$hi;";
297 }
298 else {
299 if ((defined $and_decode) && ($and_decode)) {
300 $char_equiv = Encode::decode("utf8",$char_equiv);
301 }
302 $in_surrogate=1;
303 print STDERR "found surrogate\n";
304 return $char_equiv;
305 }
306}
307# returns the character as a raw utf-8 character. It assumes that the
308# & and ; have been stripped off the string.
309# If and_decode is true, it returns the codepoint instead of utf8
310# If keep_surrogates is true, leave the surrogate entities as is - for later processing with desurrogate.
311sub getcharequiv {
312 my ($entity, $convertsymbols, $and_decode, $keep_surrogates) = @_;
313
314 $keep_surrogates = 0 unless defined $keep_surrogates;
315 my $char_equiv = undef;
316
317 # a numeric entity
318 my $code = undef;
319 if ($entity =~ m/^\#0*(\d+)$/) {
320 $code=$1;
321 }
322 elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) {
323 $code=hex($1);
324 }
325
326 if (defined $code) {
327
328 # UTF-16 surrogate pairs
329 if($code >= 0xD800 && $code <= 0xDFFF) {
330 print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). ";
331 if ($keep_surrogates) {
332 print STDERR "Leaving as entity\n";
333 return "&$entity;";
334 }
335 else {
336 print STDERR "Replacing with '?'.\n";
337 $code = ord("?");
338 }
339 }
340
341 # non-standard Microsoft breakage, as usual
342 if ($code < 0x9f) { # code page 1252 uses reserved bytes
343 if ($code == 0x91) {$code=0x2018} # 145 = single left quote
344 elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote
345 elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote
346 elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote
347 # ...
348 }
349 $char_equiv = &unicode::unicode2utf8([$code]);
350 }
351
352 # a named character entity
353 elsif (defined $charnetosf{$entity}) {
354 $char_equiv = &unicode::unicode2utf8([$charnetosf{$entity}]);
355 }
356
357 # a named symbol entity
358 elsif ($convertsymbols && defined $symnetosf{$entity}) {
359 $char_equiv = &unicode::unicode2utf8([$symnetosf{$entity}]);
360 }
361
362 if (!defined $char_equiv) {
363 return "&$entity;"; # unknown character
364 }
365 else {
366 if ((defined $and_decode) && ($and_decode)) {
367 $char_equiv = Encode::decode("utf8",$char_equiv);
368 }
369 return $char_equiv;
370 }
371}
372
373# convert character entities from named equivalents to html font
374sub convertcharentities {
375 # args: the text that you want to convert
376
377 $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse;
378}
379
380# convert any entities from named equivalents to html font
381sub convertallentities {
382 # args: the text that you want to convert
383
384 $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse;
385}
386
387sub html2txt {
388 # args: the text that you want converted to ascii,
389 # and whether to strip out sgml tags
390
391 # strip out sgml tags if needed
392 $_[0] =~ s/<[^>]*>//g if $_[1];
393
394 # convert the char entities to the standard html font
395 &convertcharentities($_[0]);
396
397 # convert the html character set to a plain ascii character set
398 my $pos = 0;
399 while ($pos < length($_[0])) {
400 my $charnum = ord(substr($_[0], $pos, 1));
401 if ($charnum >= 32) { # only convert characters above #32
402 my $replacechars = " ";
403 $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum};
404 substr($_[0], $pos, 1) = $replacechars;
405 $pos += length ($replacechars);
406
407 } else {
408 $pos ++;
409 }
410 }
411}
412
413
414# look for mime.types (eg in /etc, or apache/conf directories), or have a look
415# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
416sub guess_mime_type {
417 my ($filename) = @_;
418 # make the filename lowercase, since the mimetypes hashmap looks for lowercase
419 $filename = lc($filename);
420
421 my ($fileext) = $filename =~ /\.(\w+)$/;
422 return "unknown" unless defined $fileext;
423
424 # else
425 my $mimetype = $mime_type{$fileext};
426 return $mimetype if (defined $mimetype);
427
428 return "unknown";
429}
430
431
4321;
Note: See TracBrowser for help on using the repository browser.