root/main/trunk/greenstone2/perllib/ghtml.pm @ 30593

Revision 30593, 13.7 KB (checked in by ak19, 4 years ago)

Dr Bainbridge found another point in the code where the UTF-16 Surrogate pairs (that lead to malformed UTF-8 character errors) are encountered in HTMLPlugin. This part of the code is encountered when the PDFPlugin has the pdfbox_conversion set. PDFBox would have produced the HTML containing entities that represent characters not considered valid in UTF-8 and this then failed on Diego's test PDF until Dr Bainbridge's bugfix.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ghtml.pm -- this used to be called html.pm but it clashed
4# with the existing html module under windows
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package ghtml;
29
30use strict;
31use unicode;
32
# htmlsafe(TEXT)
#
# Converts SGML meta characters in TEXT (&, <, > and ") to entity
# references. TEXT is modified in place through the @_ alias, so the
# caller must pass a variable rather than a literal.
#
sub htmlsafe
{
    # '&' must be escaped first, otherwise the ampersands introduced by
    # the substitutions below would themselves be escaped again.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/\"/&quot;/g;
}
44
# unescape_html(TEXT)
#
# Converts the HTML entities &amp; &lt; &gt; and &quot; back into the
# characters they stand for and returns the result; the argument itself
# is left untouched.
#
sub unescape_html
{
    my ($html) = @_;

    my %char_for = ('amp' => '&', 'lt' => '<', 'gt' => '>', 'quot' => '"');

    # Decode in a single pass so that freshly decoded text is never
    # scanned again: "&amp;lt;" must become "&lt;", not "<".
    $html =~ s/&(amp|lt|gt|quot);/$char_for{$1}/g;

    return $html;
}
60
# urlsafe(TEXT)
#
# Converts characters not allowed in a URL to their hex representation
# (%xx, lower-case hex digits). TEXT is modified in place through the
# @_ alias.
#
sub urlsafe
{
    # protect any hash's that are part of an entity, e.g. &#097;
    $_[0] =~ s/&#(.*?);/&%23$1;/g;

    # and the usual suspects. The escape is always two hex digits, zero
    # padded ("%02x"): a tab must become "%09", not "% 9".
    $_[0] =~ s/([\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/])/sprintf("%%%02x", ord($1))/gse;
}
73
74
# Named character entities (accented letters and similar) mapped to the
# numeric code that getcharequiv() hands to unicode::unicode2utf8().
# Written as a flat name/code list; every value is kept as a string.
my %charnetosf = qw(
    Agrave 192   Aacute 193   Acirc  194   Atilde 195
    Auml   196   Aring  197   AElig  198   Ccedil 199
    Egrave 200   Eacute 201   Ecirc  202   Euml   203
    Igrave 204   Iacute 205   Icirc  206   Iuml   207
    ETH    208   Ntilde 209   Ograve 210   Oacute 211
    Ocirc  212   Otilde 213   Ouml   214
    Oslash 216   Ugrave 217   Uacute 218   Ucirc  219
    Uuml   220   Yacute 221   THORN  222   szlig  223
    agrave 224   aacute 225   acirc  226   atilde 227
    auml   228   aring  229   aelig  230   ccedil 231
    egrave 232   eacute 233   ecirc  234   euml   235
    igrave 236   iacute 237   icirc  238   iuml   239
    eth    240   ntilde 241   ograve 242   oacute 243
    ocirc  244   otilde 245   ouml   246
    oslash 248   ugrave 249   uacute 250   ucirc  251
    uuml   252   yacute 253   thorn  254   yuml   255
);
92
# Named symbol entities mapped to their numeric code. getcharequiv()
# consults this table only when its $convertsymbols argument is true.
my %symnetosf = qw(
    quot   34    amp    38    lt     60    gt     62
    nbsp   160   iexcl  161   cent   162   pound  163
    curren 164   yen    165   brvbar 166   sect   167
    uml    168   copy   169   ordf   170   laquo  171
    not    172   shy    173   reg    174   macr   175
    deg    176   plusmn 177   sup2   178   sup3   179
    acute  180   micro  181   para   182   middot 183
    cedil  184   sup1   185   ordm   186   raquo  187
    frac14 188   frac12 189   frac34 190   iquest 191
    times  215   divide 247
);
103
104
105
# standard font to plain text: character number => plain replacement
# text, as looked up by html2txt().
my %sftotxt = (
    # 32..126 (printable ASCII) each map to the character itself
    (map { ($_ => chr($_)) } 32 .. 126),

    130 => ',',   131 => 'f',   132 => '"',   133 => '...', 139 => '<',
    140 => 'OE',  145 => "'",   146 => "'",   147 => '"',   148 => '"',
    149 => 'o',   150 => '--',  151 => '-',   152 => '~',   153 => 'TM',
    155 => '>',   156 => 'oe',  159 => 'Y',   160 => ' ',   178 => '2',
    179 => '3',   185 => '1',   188 => '1/4', 189 => '1/2', 190 => '3/4',

    # accented capitals reduced to their base letter(s)
    192 => 'A',   193 => 'A',   194 => 'A',   195 => 'A',   196 => 'A',
    197 => 'A',   198 => 'AE',  199 => 'C',   200 => 'E',   201 => 'E',
    202 => 'E',   203 => 'E',   204 => 'I',   205 => 'I',   206 => 'I',
    207 => 'I',   208 => 'D',   209 => 'N',   210 => 'O',   211 => 'O',
    212 => 'O',   213 => 'O',   214 => 'O',   215 => '*',   216 => 'O',
    217 => 'U',   218 => 'U',   219 => 'U',   220 => 'U',   221 => 'Y',

    # accented lower-case letters reduced to their base letter(s)
    223 => 'ss',  224 => 'a',   225 => 'a',   226 => 'a',   227 => 'a',
    228 => 'a',   229 => 'a',   230 => 'ae',  231 => 'c',   232 => 'e',
    233 => 'e',   234 => 'e',   235 => 'e',   236 => 'i',   237 => 'i',
    238 => 'i',   239 => 'i',   241 => 'n',   242 => 'o',   243 => 'o',
    244 => 'o',   245 => 'o',   246 => 'o',   247 => '/',   248 => 'o',
    249 => 'u',   250 => 'u',   251 => 'u',   252 => 'u',   253 => 'y',
    255 => 'y',   8218 => ',',
);
146
147
# Lower-case file extension => MIME type, consulted by guess_mime_type().
my %mime_type = qw(
    ai       application/postscript
    aif      audio/x-aiff
    aifc     audio/x-aiff
    aiff     audio/x-aiff
    au       audio/basic
    avi      video/x-msvideo
    bcpio    application/x-bcpio
    bin      application/octet-stream
    cdf      application/x-netcdf
    class    application/octet-stream
    cpio     application/x-cpio
    cpt      application/mac-compactpro
    csh      application/x-csh
    dcr      application/x-director
    dir      application/x-director
    dms      application/octet-stream
    doc      application/msword
    dvi      application/x-dvi
    dxr      application/x-director
    eps      application/postscript
    etx      text/x-setext
    exe      application/octet-stream
    gif      image/gif
    gtar     application/x-gtar
    hdf      application/x-hdf
    hqx      application/mac-binhex40
    htm      text/html
    html     text/html
    ice      x-conference/x-cooltalk
    ief      image/ief
    jpe      image/jpeg
    jpeg     image/jpeg
    jpg      image/jpeg
    kar      audio/midi
    latex    application/x-latex
    lha      application/octet-stream
    lzh      application/octet-stream
    man      application/x-troff-man
    mcf      image/vasa
    me       application/x-troff-me
    mid      audio/midi
    midi     audio/midi
    mif      application/x-mif
    mov      video/quicktime
    movie    video/x-sgi-movie
    mp2      audio/mpeg
    mpe      video/mpeg
    mpeg     video/mpeg
    mpg      video/mpeg
    mpga     audio/mpeg
    ms       application/x-troff-ms
    nc       application/x-netcdf
    oda      application/oda
    pbm      image/x-portable-bitmap
    pdb      chemical/x-pdb
    pdf      application/pdf
    pgm      image/x-portable-graymap
    png      image/png
    pnm      image/x-portable-anymap
    ppm      image/x-portable-pixmap
    ppt      application/vnd.ms-powerpoint
    ps       application/postscript
    qt       video/quicktime
    ra       audio/x-realaudio
    ram      audio/x-pn-realaudio
    ras      image/x-cmu-raster
    rgb      image/x-rgb
    roff     application/x-troff
    rpm      audio/x-pn-realaudio-plugin
    rtf      application/rtf
    rtx      text/richtext
    sgm      text/x-sgml
    sgml     text/x-sgml
    sh       application/x-sh
    shar     application/x-shar
    sit      application/x-stuffit
    skd      application/x-koan
    skm      application/x-koan
    skp      application/x-koan
    skt      application/x-koan
    snd      audio/basic
    src      application/x-wais-source
    sv4cpio  application/x-sv4cpio
    sv4crc   application/x-sv4crc
    t        application/x-troff
    tar      application/x-tar
    tcl      application/x-tcl
    tex      application/x-tex
    texi     application/x-texinfo
    texinfo  application/x-texinfo
    tif      image/tiff
    tiff     image/tiff
    tr       application/x-troff
    tsv      text/tab-separated-values
    txt      text/plain
    ustar    application/x-ustar
    vcd      application/x-cdlink
    vrml     x-world/x-vrml
    wav      audio/x-wav
    wrl      x-world/x-vrml
    xbm      image/x-xbitmap
    xls      application/vnd.ms-excel
    xpm      image/x-xpixmap
    xwd      image/x-xwindowdump
    xyz      chemical/x-pdb
    zip      application/zip
);
203
204
# getcharequiv(ENTITY, CONVERTSYMBOLS, AND_DECODE)
#
# Returns the character for an HTML entity as produced by
# unicode::unicode2utf8(). It assumes that the & and ; have been
# stripped off the string.
#
#   ENTITY          entity body: "#233", "#xE9" or a name such as "eacute"
#   CONVERTSYMBOLS  when true, names in %symnetosf (quot, amp, nbsp, ...)
#                   are converted as well as those in %charnetosf
#   AND_DECODE      when true, the result is additionally passed through
#                   Encode::decode("utf8", ...)
#
# An entity that is not recognised is returned unchanged, with its
# & and ; put back.
sub getcharequiv {
    my ($entity, $convertsymbols, $and_decode) = @_;

    # a numeric entity, decimal (leading zeros dropped) or hexadecimal
    my $code = undef;
    if ($entity =~ m/^\#0*(\d+)$/) {
        $code = $1;
    }
    elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) {
        $code = hex($1);
    }

    my $char_equiv = undef;

    if (defined $code) {

        # malformed UTF-8 character used in UTF-16
        if ($code >= 0xD800 && $code <= 0xDFFF) {
            print STDERR "Warning: encountered the HTML entity \&#$code; which represents part of a UTF-16 surrogate pair, which is not supported in ghtml::getcharequiv(). Replacing with '?'.\n";
            $code = ord("?");
        }

        # non-standard Microsoft breakage, as usual: code page 1252 uses
        # reserved bytes for the curly quotes
        my %cp1252_quote = (
            0x91 => 0x2018,    # 145 = single left quote
            0x92 => 0x2019,    # 146 = single right quote
            0x93 => 0x201c,    # 147 = double left quote
            0x94 => 0x201d,    # 148 = double right quote
        );
        $code = $cp1252_quote{$code} if defined $cp1252_quote{$code};

        $char_equiv = &unicode::unicode2utf8([$code]);
    }

    # a named character entity
    elsif (defined $charnetosf{$entity}) {
        $char_equiv = &unicode::unicode2utf8([$charnetosf{$entity}]);
    }

    # a named symbol entity
    elsif ($convertsymbols && defined $symnetosf{$entity}) {
        $char_equiv = &unicode::unicode2utf8([$symnetosf{$entity}]);
    }

    # unknown character
    return "&$entity;" if (!defined $char_equiv);

    if ($and_decode) {
        # This file never loads Encode itself, so load it on demand
        # instead of relying on some other module having done so.
        require Encode;
        $char_equiv = Encode::decode("utf8", $char_equiv);
    }
    return $char_equiv;
}
261
# convert character entities from named equivalents to html font
#
# args: the text that you want to convert (modified in place).
# Numeric entities and the names known to %charnetosf are converted;
# symbol entities (&amp;, &lt;, &nbsp;, ...) are left as they are,
# because getcharequiv() is called with its convertsymbols flag off.
sub convertcharentities {
    # Match only entity-shaped tokens (an optional '#' followed by
    # letters/digits). A looser "anything up to the next ';'" pattern
    # lets a stray '&', as in "AT&T &eacute;", swallow the real entity
    # that follows it.
    $_[0] =~ s/&(\#?[A-Za-z0-9]+);/&getcharequiv($1,0)/gse;
}
268
# convert any entities from named equivalents to html font
#
# args: the text that you want to convert (modified in place).
# Numeric entities, names known to %charnetosf and symbol names known
# to %symnetosf are all converted, because getcharequiv() is called
# with its convertsymbols flag on.
sub convertallentities {
    # Match only entity-shaped tokens (an optional '#' followed by
    # letters/digits). A looser "anything up to the next ';'" pattern
    # lets a stray '&', as in "AT&T &eacute;", swallow the real entity
    # that follows it.
    $_[0] =~ s/&(\#?[A-Za-z0-9]+);/&getcharequiv($1,1)/gse;
}
275
# html2txt(TEXT, STRIPTAGS)
#
# args: the text that you want converted to ascii (modified in place),
# and whether to strip out sgml tags
sub html2txt {
    # strip out sgml tags if needed
    $_[0] =~ s/<[^>]*>//g if $_[1];

    # convert the char entities to the standard html font
    &convertcharentities($_[0]);

    # convert the html character set to a plain ascii character set:
    # every character numbered 32 and above is replaced by its %sftotxt
    # entry, or by a single space when the table has none. Characters
    # below 32 are left alone. The replacement text is not rescanned.
    $_[0] =~ s/([^\x00-\x1f])/defined $sftotxt{ord($1)} ? $sftotxt{ord($1)} : " "/ge;

    return;
}
301
302
# guess_mime_type(FILENAME)
#
# Returns the MIME type that %mime_type records for FILENAME's
# extension, or the string "unknown" when there is no extension or the
# extension is not in the table.
#
# look for mime.types (eg in /etc, or apache/conf directories), or have a look
# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
sub guess_mime_type {
    my ($filename) = @_;

    # the table is keyed by lower-case extensions, so fold case first
    if (lc($filename) =~ /\.(\w+)$/) {
        my $known_type = $mime_type{$1};
        return $known_type if (defined $known_type);
    }

    return "unknown";
}
319
320
3211;
Note: See TracBrowser for help on using the browser.