[1010] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
| 3 | # ghtml.pm -- this used to be called html.pm but it clashed
|
---|
| 4 | # with the existing html module under windows
|
---|
| 5 | #
|
---|
| 6 | # A component of the Greenstone digital library software
|
---|
| 7 | # from the New Zealand Digital Library Project at the
|
---|
| 8 | # University of Waikato, New Zealand.
|
---|
| 9 | #
|
---|
| 10 | # Copyright (C) 1999 New Zealand Digital Library Project
|
---|
| 11 | #
|
---|
| 12 | # This program is free software; you can redistribute it and/or modify
|
---|
| 13 | # it under the terms of the GNU General Public License as published by
|
---|
| 14 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 15 | # (at your option) any later version.
|
---|
| 16 | #
|
---|
| 17 | # This program is distributed in the hope that it will be useful,
|
---|
| 18 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 19 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 20 | # GNU General Public License for more details.
|
---|
| 21 | #
|
---|
| 22 | # You should have received a copy of the GNU General Public License
|
---|
| 23 | # along with this program; if not, write to the Free Software
|
---|
| 24 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 25 | #
|
---|
| 26 | ###########################################################################
|
---|
| 27 |
|
---|
| 28 | package ghtml;
|
---|
| 29 |
|
---|
[15894] | 30 | use strict;
|
---|
[3181] | 31 | use unicode;
|
---|
| 32 |
|
---|
# htmlsafe(TEXT)
#
# Converts the SGML meta characters &, <, > and " in TEXT to the entity
# references &amp;, &lt;, &gt; and &quot;.
#
# TEXT is modified in place (through the @_ alias); the caller's variable
# holds the escaped text afterwards.
#
sub htmlsafe
{
    # The ampersand has to be escaped first: doing it later would escape
    # the ampersands introduced by the other three replacements again.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/"/&quot;/g;
}
|
---|
| 44 |
|
---|
# unescape_html(TEXT)
#
# Converts the HTML entities &lt;, &gt;, &quot; and &amp; in TEXT back into
# the characters they stand for and returns the result. The argument
# itself is left untouched (the work is done on a copy).
#
sub unescape_html
{
    my ($html) = @_;

    $html =~ s/&lt;/</g;
    $html =~ s/&gt;/>/g;
    $html =~ s/&quot;/"/g;

    # &amp; is handled last so that an escaped entity such as "&amp;lt;"
    # comes out as the literal text "&lt;" instead of being unescaped twice.
    $html =~ s/&amp;/&/g;

    return $html;
}
|
---|
| 60 |
|
---|
# urlsafe(TEXT)
#
# Converts characters not allowed in a URL to their hex representation
# (%xx, lowercase hex digits). The characters escaped are tab, space and
#   " < > [ \ ] ^ ` { | } ~ ? = & + _ /
#
# TEXT is modified in place (through the @_ alias).
#
# NOTE(review): '%' itself is not in the escaped set, so text that already
# contains %xx sequences passes through unchanged.
#
sub urlsafe
{
    # %02x (not %2x) so that codes below 0x10 are zero padded: a tab must
    # become "%09", not "% 9".
    $_[0] =~ s{([\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e?=&+_/])}
              {sprintf("%%%02x", ord($1))}ge;
}
|
---|
| 69 |
|
---|
| 70 |
|
---|
# named entry to the standard html font
#
# Maps the name of a character entity (accented letters and similar, as
# written between '&' and ';') to its numeric character code, kept as a
# decimal string. Codes run from 192 to 255; 215 and 247 are absent here.
my %charnetosf = qw(
    Agrave 192   Aacute 193   Acirc  194   Atilde 195
    Auml   196   Aring  197   AElig  198   Ccedil 199
    Egrave 200   Eacute 201   Ecirc  202   Euml   203
    Igrave 204   Iacute 205   Icirc  206   Iuml   207
    ETH    208   Ntilde 209   Ograve 210   Oacute 211
    Ocirc  212   Otilde 213   Ouml   214
    Oslash 216   Ugrave 217   Uacute 218   Ucirc  219
    Uuml   220   Yacute 221   THORN  222   szlig  223
    agrave 224   aacute 225   acirc  226   atilde 227
    auml   228   aring  229   aelig  230   ccedil 231
    egrave 232   eacute 233   ecirc  234   euml   235
    igrave 236   iacute 237   icirc  238   iuml   239
    eth    240   ntilde 241   ograve 242   oacute 243
    ocirc  244   otilde 245   ouml   246
    oslash 248   ugrave 249   uacute 250   ucirc  251
    uuml   252   yacute 253   thorn  254   yuml   255
);
|
---|
| 88 |
|
---|
# Maps the name of a symbol entity (markup characters, punctuation,
# currency signs, fractions, ...) to its numeric character code, kept as
# a decimal string.
my %symnetosf = qw(
    quot   34    amp    38    lt     60    gt     62
    nbsp   160   iexcl  161   cent   162   pound  163
    curren 164   yen    165   brvbar 166   sect   167
    uml    168   copy   169   ordf   170   laquo  171
    not    172   shy    173   reg    174   macr   175
    deg    176   plusmn 177   sup2   178   sup3   179
    acute  180   micro  181   para   182   middot 183
    cedil  184   sup1   185   ordm   186   raquo  187
    frac14 188   frac12 189   frac34 190   iquest 191
    times  215   divide 247
);
|
---|
| 99 |
|
---|
| 100 |
|
---|
| 101 |
|
---|
# standard font to plain text
#
# Maps a character code to the plain ASCII text that stands in for it.
# html2txt looks every character with a code of 32 or more up in this
# table and substitutes a single space when the code is missing.
my %sftotxt = (
    # printable ASCII (codes 32 to 126): every character stands for itself
    (map { ($_ => chr($_)) } 32 .. 126),

    # punctuation, quotes and ligatures in the 130-159 range
    130 => ',',   131 => 'f',   132 => '"',   133 => '...', 139 => '<',
    140 => 'OE',  145 => "'",   146 => "'",   147 => '"',   148 => '"',
    149 => 'o',   150 => '--',  151 => '-',   152 => '~',   153 => 'TM',
    155 => '>',   156 => 'oe',  159 => 'Y',

    # no-break space, superscript digits and fractions
    160 => ' ',   178 => '2',   179 => '3',   185 => '1',
    188 => '1/4', 189 => '1/2', 190 => '3/4',

    # accented capitals lose their accents
    192 => 'A',   193 => 'A',   194 => 'A',   195 => 'A',   196 => 'A',
    197 => 'A',   198 => 'AE',  199 => 'C',   200 => 'E',   201 => 'E',
    202 => 'E',   203 => 'E',   204 => 'I',   205 => 'I',   206 => 'I',
    207 => 'I',   208 => 'D',   209 => 'N',   210 => 'O',   211 => 'O',
    212 => 'O',   213 => 'O',   214 => 'O',   215 => '*',   216 => 'O',
    217 => 'U',   218 => 'U',   219 => 'U',   220 => 'U',   221 => 'Y',

    # accented small letters lose their accents
    223 => 'ss',  224 => 'a',   225 => 'a',   226 => 'a',   227 => 'a',
    228 => 'a',   229 => 'a',   230 => 'ae',  231 => 'c',   232 => 'e',
    233 => 'e',   234 => 'e',   235 => 'e',   236 => 'i',   237 => 'i',
    238 => 'i',   239 => 'i',   241 => 'n',   242 => 'o',   243 => 'o',
    244 => 'o',   245 => 'o',   246 => 'o',   247 => '/',   248 => 'o',
    249 => 'u',   250 => 'u',   251 => 'u',   252 => 'u',   253 => 'y',
    255 => 'y',

    8218 => ',',
);
|
---|
| 142 |
|
---|
| 143 |
|
---|
# Maps a lowercase file extension (without the leading dot) to its MIME
# type. guess_mime_type consults this table.
my %mime_type = qw(
    ai       application/postscript
    aif      audio/x-aiff
    aifc     audio/x-aiff
    aiff     audio/x-aiff
    au       audio/basic
    avi      video/x-msvideo
    bcpio    application/x-bcpio
    bin      application/octet-stream
    cdf      application/x-netcdf
    class    application/octet-stream
    cpio     application/x-cpio
    cpt      application/mac-compactpro
    csh      application/x-csh
    dcr      application/x-director
    dir      application/x-director
    dms      application/octet-stream
    doc      application/msword
    dvi      application/x-dvi
    dxr      application/x-director
    eps      application/postscript
    etx      text/x-setext
    exe      application/octet-stream
    gif      image/gif
    gtar     application/x-gtar
    hdf      application/x-hdf
    hqx      application/mac-binhex40
    htm      text/html
    html     text/html
    ice      x-conference/x-cooltalk
    ief      image/ief
    jpe      image/jpeg
    jpeg     image/jpeg
    jpg      image/jpeg
    kar      audio/midi
    latex    application/x-latex
    lha      application/octet-stream
    lzh      application/octet-stream
    man      application/x-troff-man
    mcf      image/vasa
    me       application/x-troff-me
    mid      audio/midi
    midi     audio/midi
    mif      application/x-mif
    mov      video/quicktime
    movie    video/x-sgi-movie
    mp2      audio/mpeg
    mpe      video/mpeg
    mpeg     video/mpeg
    mpg      video/mpeg
    mpga     audio/mpeg
    ms       application/x-troff-ms
    nc       application/x-netcdf
    oda      application/oda
    pbm      image/x-portable-bitmap
    pdb      chemical/x-pdb
    pdf      application/pdf
    pgm      image/x-portable-graymap
    png      image/png
    pnm      image/x-portable-anymap
    ppm      image/x-portable-pixmap
    ppt      application/vnd.ms-powerpoint
    ps       application/postscript
    qt       video/quicktime
    ra       audio/x-realaudio
    ram      audio/x-pn-realaudio
    ras      image/x-cmu-raster
    rgb      image/x-rgb
    roff     application/x-troff
    rpm      audio/x-pn-realaudio-plugin
    rtf      application/rtf
    rtx      text/richtext
    sgm      text/x-sgml
    sgml     text/x-sgml
    sh       application/x-sh
    shar     application/x-shar
    sit      application/x-stuffit
    skd      application/x-koan
    skm      application/x-koan
    skp      application/x-koan
    skt      application/x-koan
    snd      audio/basic
    src      application/x-wais-source
    sv4cpio  application/x-sv4cpio
    sv4crc   application/x-sv4crc
    t        application/x-troff
    tar      application/x-tar
    tcl      application/x-tcl
    tex      application/x-tex
    texi     application/x-texinfo
    texinfo  application/x-texinfo
    tif      image/tiff
    tiff     image/tiff
    tr       application/x-troff
    tsv      text/tab-separated-values
    txt      text/plain
    ustar    application/x-ustar
    vcd      application/x-cdlink
    vrml     x-world/x-vrml
    wav      audio/x-wav
    wrl      x-world/x-vrml
    xbm      image/x-xbitmap
    xls      application/vnd.ms-excel
    xpm      image/x-xpixmap
    xwd      image/x-xwindowdump
    xyz      chemical/x-pdb
    zip      application/zip
);
|
---|
| 199 |
|
---|
| 200 |
|
---|
# getcharequiv(ENTITY, CONVERTSYMBOLS, AND_DECODE)
#
# Returns the character an HTML entity stands for. It assumes that the
# & and ; have been stripped off the string.
#
#   ENTITY          entity body: "#233", "#xE9" or a name such as "eacute"
#   CONVERTSYMBOLS  when true, names listed in %symnetosf (amp, lt, nbsp,
#                   ...) are converted as well; otherwise only numeric
#                   entities and names listed in %charnetosf are
#   AND_DECODE      when true, the result is passed through
#                   Encode::decode("utf8", ...) before it is returned
#
# The character is produced by unicode::unicode2utf8 (a raw utf-8
# character, going by the comment this sub has always carried). An entity
# that cannot be resolved is handed back unchanged as "&ENTITY;".
sub getcharequiv {
    my ($entity, $convertsymbols, $and_decode) = @_;

    # Work out which character code the entity refers to, if any.
    my $code;
    if ($entity =~ m/^\#0*(\d+)$/) {
        # decimal numeric entity, leading zeros ignored
        $code = $1;
    }
    elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) {
        # hexadecimal numeric entity
        $code = hex($1);
    }

    if (defined $code) {
        # non-standard Microsoft breakage: code page 1252 puts the curly
        # quotes on bytes that are reserved, so move them to the real
        # Unicode code points
        my %cp1252_quote = (
            0x91 => 0x2018,    # 145 = single left quote
            0x92 => 0x2019,    # 146 = single right quote
            0x93 => 0x201c,    # 147 = double left quote
            0x94 => 0x201d,    # 148 = double right quote
        );
        $code = $cp1252_quote{$code} if exists $cp1252_quote{$code};
    }
    elsif (defined $charnetosf{$entity}) {
        # a named character entity
        $code = $charnetosf{$entity};
    }
    elsif ($convertsymbols && defined $symnetosf{$entity}) {
        # a named symbol entity
        $code = $symnetosf{$entity};
    }

    # unknown character: give the entity back as it was written
    return "&$entity;" unless defined $code;

    my $char_equiv = unicode::unicode2utf8([$code]);
    return "&$entity;" unless defined $char_equiv;

    if ($and_decode) {
        # This file does not load Encode itself, so make sure it is there
        # before calling into it.
        require Encode;
        $char_equiv = Encode::decode("utf8", $char_equiv);
    }
    return $char_equiv;
}
|
---|
| 249 |
|
---|
# convert character entities from named equivalents to html font
#
# args: the text that you want to convert (modified in place)
#
# Every "&...;" entity reference in the text is replaced by whatever
# getcharequiv returns for it with symbol conversion switched off, so
# numeric entities and named character entities are converted while
# symbol entities such as &amp; stay as they are.
sub convertcharentities {
    # An entity body is an optional '#' followed by word characters.
    # Matching "anything up to the next ;" would let a stray '&' (as in
    # "R&D caf&eacute;") swallow the real entity that follows it.
    $_[0] =~ s/&(\#?\w+);/getcharequiv($1, 0)/ge;
}
|
---|
| 256 |
|
---|
# convert any entities from named equivalents to html font
#
# args: the text that you want to convert (modified in place)
#
# Every "&...;" entity reference in the text is replaced by whatever
# getcharequiv returns for it with symbol conversion switched on, so
# numeric entities, named character entities and named symbol entities
# are all converted.
sub convertallentities {
    # An entity body is an optional '#' followed by word characters.
    # Matching "anything up to the next ;" would let a stray '&' (as in
    # "R&D &copy; 1999") swallow the real entity that follows it.
    $_[0] =~ s/&(\#?\w+);/getcharequiv($1, 1)/ge;
}
|
---|
| 263 |
|
---|
# html2txt(TEXT, STRIPTAGS)
#
# Converts html text to plain ascii, in place.
#
# args: the text that you want converted to ascii,
#       and whether to strip out sgml tags
#
# Numeric entities and named character entities are expanded first;
# symbol entities such as &amp; are left as written. After that every
# character with a code of 32 or more is replaced by its entry in
# %sftotxt, or by a single space when it has none. Characters below 32
# (newlines, tabs, ...) are kept.
sub html2txt {
    # strip out sgml tags if needed
    $_[0] =~ s/<[^>]*>//g if $_[1];

    # Expand the char entities. getcharequiv is asked to decode its
    # result (third argument) so that each entity becomes ONE character
    # whose code can be looked up in %sftotxt below; without that an
    # entity arrives as several utf-8 bytes and each byte gets looked up
    # on its own. getcharequiv needs Encode for this and the file does
    # not load it, hence the require.
    # NOTE(review): this relies on unicode::unicode2utf8 returning utf-8
    # encoded bytes, as the comment on getcharequiv says - confirm.
    require Encode;
    $_[0] =~ s/&(\#?\w+);/getcharequiv($1, 0, 1)/ge;

    # convert the html character set to a plain ascii character set
    # (only characters from #32 upwards are converted)
    $_[0] =~ s/([^\x00-\x1f])/defined $sftotxt{ord($1)} ? $sftotxt{ord($1)} : " "/ge;
}
|
---|
| 289 |
|
---|
| 290 |
|
---|
# guess_mime_type(FILENAME)
#
# Returns the MIME type registered in %mime_type for the extension of
# FILENAME, or the string "unknown" when the name has no extension or the
# extension is not in the table.
#
# For more types look for mime.types (eg in /etc, or apache/conf
# directories), or have a look at
# <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
sub guess_mime_type {
    my ($filename) = @_;

    # the table is keyed by lowercase extensions, so fold the case of the
    # name before picking the extension off its end
    if (lc($filename) =~ /\.(\w+)$/) {
        my $known_type = $mime_type{$1};
        return $known_type if defined $known_type;
    }

    return "unknown";
}
|
---|
| 307 |
|
---|
| 308 |
|
---|
| 309 | 1;
|
---|