source: gsdl/trunk/perllib/ghtml.pm@ 17110

Last change on this file since 17110 was 17058, checked in by ak19, 16 years ago
  1. Moved the mime_type hashmap out of the guess_mime_type subroutine since this hashmap might get recreated everytime that subroutine is called. 2. Converted the filename from which the mimetype is guessed into lowercase first since the mime_type hashmap only contains lowercase file extension.
  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1###########################################################################
2#
3# ghtml.pm -- this used to be called html.pm but it clashed
4# with the existing html module under windows
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package ghtml;
29
30use strict;
31use unicode;
32
# htmlsafe(TEXT)
#
# Converts SGML meta characters in TEXT to entity references.
#
# TEXT is modified in place through the @_ alias, so the caller must pass
# a variable rather than a literal.  The characters handled are
# & < > and the double quote.
#
sub htmlsafe
{
    # '&' has to be escaped first: the replacements below introduce
    # ampersands of their own, which must not be escaped a second time.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/\"/&quot;/g;
}
44
# unescape_html(TEXT)
#
# Converts the HTML entities &lt; &gt; &quot; and &amp; back into their
# original characters and returns the result.  The argument itself is
# left untouched.
#
sub unescape_html
{
    my ($html) = @_;

    $html =~ s/&lt;/</g;
    $html =~ s/&gt;/>/g;
    $html =~ s/&quot;/\"/g;
    # '&amp;' must be decoded last.  Decoding it first would turn an
    # escaped entity such as '&amp;lt;' into '&lt;' and then, wrongly,
    # into '<' (double unescaping).
    $html =~ s/&amp;/&/g;

    return $html;
}
60
# urlsafe(TEXT)
#
# Converts characters not allowed in a URL to their hex representation
# (a '%' followed by two lowercase hex digits).
#
# TEXT is modified in place through the @_ alias.  Note that '%' itself
# is not part of the escaped set.
#
sub urlsafe
{
    # %02x (zero padded) rather than %2x (space padded): a TAB must become
    # "%09", not "% 9".  A capture group is used instead of $& so that the
    # match variable penalty is not imposed on the rest of the program.
    $_[0] =~ s/([\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/])/sprintf("%%%02x", ord($1))/gse;
}
69
# dmsafe
#
# Returns a copy of the argument in which every character that could upset
# display.pl or html has been replaced by an entity reference.
# Don't use this on dm macros as they will no longer be recognisable by dm
# (which, by the way, is the whole idea).
# -- Rodger 12/1/98
sub dmsafe {
    my ($text) = $_[0];

    my %entity_for = (
        '&'  => '&amp;',     # for html
        '<'  => '&lt;',      # for html
        '>'  => '&gt;',      # for html
        '_'  => '&#095;',    # for dm (macros conventionally start with _)
        '"'  => '&quot;',    # for html (must not be read as a quote)
        '{'  => '&#123;',    # for dm blocks
        '}'  => '&#125;',    # for dm blocks
        '\\' => '&#092;',    # for dm (dm removes naturally occurring backslashes)
    );

    # One pass is enough: none of the replacement strings contains a
    # character from the set being escaped.
    $text =~ s/([&<>_"{}\\])/$entity_for{$1}/g;

    return $text;
}
88
89
# named entry to the standard html font
#
# Maps the name of an accented-letter entity (without the leading '&' and
# trailing ';') to its character code.  The codes run in four unbroken
# ranges, so each range is filled in with a hash slice.  215 and 247 are
# skipped here; they belong to the symbol entities "times" and "divide".
my %charnetosf;
@charnetosf{qw(Agrave Aacute Acirc  Atilde Auml   Aring  AElig  Ccedil
               Egrave Eacute Ecirc  Euml   Igrave Iacute Icirc  Iuml
               ETH    Ntilde Ograve Oacute Ocirc  Otilde Ouml)} = (192 .. 214);
@charnetosf{qw(Oslash Ugrave Uacute Ucirc  Uuml   Yacute THORN  szlig)}
    = (216 .. 223);
@charnetosf{qw(agrave aacute acirc  atilde auml   aring  aelig  ccedil
               egrave eacute ecirc  euml   igrave iacute icirc  iuml
               eth    ntilde ograve oacute ocirc  otilde ouml)} = (224 .. 246);
@charnetosf{qw(oslash ugrave uacute ucirc  uuml   yacute thorn  yuml)}
    = (248 .. 255);
107
# Maps the name of a symbol entity (without the leading '&' and trailing
# ';') to its character code.  Codes 160 .. 191 form one unbroken range and
# are filled in with a hash slice; the remaining six are listed one by one.
my %symnetosf = (
    quot   => 34,
    amp    => 38,
    lt     => 60,
    gt     => 62,
    times  => 215,
    divide => 247,
);
@symnetosf{qw(nbsp   iexcl  cent   pound  curren yen    brvbar sect
              uml    copy   ordf   laquo  not    shy    reg    macr
              deg    plusmn sup2   sup3   acute  micro  para   middot
              cedil  sup1   ordm   raquo  frac14 frac12 frac34 iquest)}
    = (160 .. 191);
118
119
120
# standard font to plain text
#
# Maps a character code (decimal, used as the hash key) to the plain ASCII
# text that html2txt substitutes for it.  Codes that have no entry here are
# replaced by a single space in html2txt.
my %sftotxt = (
    # printable ASCII (32 .. 126) maps to itself; generating the range
    # avoids the duplicated rows the hand-written list used to contain
    (map { $_ => chr($_) } 32 .. 126),

    "130" => ",", "131" => "f", "132" => "\"", "133" => "...", "139" => "<",
    "140" => "OE", "145" => "'", "146" => "'", "147" => "\"", "148" => "\"",
    "149" => "o", "150" => "--", "151" => "-", "152" => "~", "153" => "TM",
    "155" => ">", "156" => "oe", "159" => "Y", "160" => " ", "178" => "2",
    "179" => "3", "185" => "1", "188" => "1/4", "189" => "1/2", "190" => "3/4",
    "192" => "A", "193" => "A", "194" => "A", "195" => "A", "196" => "A",
    "197" => "A", "198" => "AE", "199" => "C", "200" => "E", "201" => "E",
    "202" => "E", "203" => "E", "204" => "I", "205" => "I", "206" => "I",
    "207" => "I", "208" => "D", "209" => "N", "210" => "O", "211" => "O",
    "212" => "O", "213" => "O", "214" => "O", "215" => "*", "216" => "O",
    "217" => "U", "218" => "U", "219" => "U", "220" => "U", "221" => "Y",
    "223" => "ss", "224" => "a", "225" => "a", "226" => "a", "227" => "a",
    "228" => "a", "229" => "a", "230" => "ae", "231" => "c", "232" => "e",
    "233" => "e", "234" => "e", "235" => "e", "236" => "i", "237" => "i",
    "238" => "i", "239" => "i", "241" => "n", "242" => "o", "243" => "o",
    "244" => "o", "245" => "o", "246" => "o", "247" => "/", "248" => "o",
    "249" => "u", "250" => "u", "251" => "u", "252" => "u", "253" => "y",
    "255" => "y", "8218" => ",",
);
161
162
# Maps a lowercase file extension (without the dot) to a MIME type.
# Consulted by guess_mime_type.  Kept at file scope so that the table is
# built once rather than on every call.
my %mime_type = (
    'ai'      => 'application/postscript',
    'aif'     => 'audio/x-aiff',
    'aifc'    => 'audio/x-aiff',
    'aiff'    => 'audio/x-aiff',
    'au'      => 'audio/basic',
    'avi'     => 'video/x-msvideo',
    'bcpio'   => 'application/x-bcpio',
    'bin'     => 'application/octet-stream',
    'cdf'     => 'application/x-netcdf',
    'class'   => 'application/octet-stream',
    'cpio'    => 'application/x-cpio',
    'cpt'     => 'application/mac-compactpro',
    'csh'     => 'application/x-csh',
    'dcr'     => 'application/x-director',
    'dir'     => 'application/x-director',
    'dms'     => 'application/octet-stream',
    'doc'     => 'application/msword',
    'dvi'     => 'application/x-dvi',
    'dxr'     => 'application/x-director',
    'eps'     => 'application/postscript',
    'etx'     => 'text/x-setext',
    'exe'     => 'application/octet-stream',
    'gif'     => 'image/gif',
    'gtar'    => 'application/x-gtar',
    'hdf'     => 'application/x-hdf',
    'hqx'     => 'application/mac-binhex40',
    'htm'     => 'text/html',
    'html'    => 'text/html',
    'ice'     => 'x-conference/x-cooltalk',
    'ief'     => 'image/ief',
    'jpe'     => 'image/jpeg',
    'jpeg'    => 'image/jpeg',
    'jpg'     => 'image/jpeg',
    'kar'     => 'audio/midi',
    'latex'   => 'application/x-latex',
    'lha'     => 'application/octet-stream',
    'lzh'     => 'application/octet-stream',
    'man'     => 'application/x-troff-man',
    'mcf'     => 'image/vasa',
    'me'      => 'application/x-troff-me',
    'mid'     => 'audio/midi',
    'midi'    => 'audio/midi',
    'mif'     => 'application/x-mif',
    'mov'     => 'video/quicktime',
    'movie'   => 'video/x-sgi-movie',
    'mp2'     => 'audio/mpeg',
    'mpe'     => 'video/mpeg',
    'mpeg'    => 'video/mpeg',
    'mpg'     => 'video/mpeg',
    'mpga'    => 'audio/mpeg',
    'ms'      => 'application/x-troff-ms',
    'nc'      => 'application/x-netcdf',
    'oda'     => 'application/oda',
    'pbm'     => 'image/x-portable-bitmap',
    'pdb'     => 'chemical/x-pdb',
    'pdf'     => 'application/pdf',
    'pgm'     => 'image/x-portable-graymap',
    'png'     => 'image/png',
    'pnm'     => 'image/x-portable-anymap',
    'ppm'     => 'image/x-portable-pixmap',
    'ppt'     => 'application/vnd.ms-powerpoint',
    'ps'      => 'application/postscript',
    'qt'      => 'video/quicktime',
    'ra'      => 'audio/x-realaudio',
    'ram'     => 'audio/x-pn-realaudio',
    'ras'     => 'image/x-cmu-raster',
    'rgb'     => 'image/x-rgb',
    'roff'    => 'application/x-troff',
    'rpm'     => 'audio/x-pn-realaudio-plugin',
    'rtf'     => 'application/rtf',
    'rtx'     => 'text/richtext',
    'sgm'     => 'text/x-sgml',
    'sgml'    => 'text/x-sgml',
    'sh'      => 'application/x-sh',
    'shar'    => 'application/x-shar',
    'sit'     => 'application/x-stuffit',
    'skd'     => 'application/x-koan',
    'skm'     => 'application/x-koan',
    'skp'     => 'application/x-koan',
    'skt'     => 'application/x-koan',
    'snd'     => 'audio/basic',
    'src'     => 'application/x-wais-source',
    'sv4cpio' => 'application/x-sv4cpio',
    'sv4crc'  => 'application/x-sv4crc',
    't'       => 'application/x-troff',
    'tar'     => 'application/x-tar',
    'tcl'     => 'application/x-tcl',
    'tex'     => 'application/x-tex',
    'texi'    => 'application/x-texinfo',
    'texinfo' => 'application/x-texinfo',
    'tif'     => 'image/tiff',
    'tiff'    => 'image/tiff',
    'tr'      => 'application/x-troff',
    'tsv'     => 'text/tab-separated-values',
    'txt'     => 'text/plain',
    'ustar'   => 'application/x-ustar',
    'vcd'     => 'application/x-cdlink',
    'vrml'    => 'x-world/x-vrml',
    'wav'     => 'audio/x-wav',
    'wrl'     => 'x-world/x-vrml',
    'xbm'     => 'image/x-xbitmap',
    'xls'     => 'application/vnd.ms-excel',
    'xpm'     => 'image/x-xpixmap',
    'xwd'     => 'image/x-xwindowdump',
    'xyz'     => 'chemical/x-pdb',
    'zip'     => 'application/zip',
);
218
219
# getcharequiv(ENTITY, CONVERTSYMBOLS)
#
# Returns the character for ENTITY as produced by unicode::unicode2utf8.
# It assumes that the & and ; have been stripped off the string.
#
#   ENTITY          a numeric reference ("#233", "#xE9") or an entity name
#                   ("eacute")
#   CONVERTSYMBOLS  when true, the symbol entities in %symnetosf (quot, amp,
#                   lt, gt, nbsp, ...) are converted as well; when false
#                   only numeric references and the letters in %charnetosf
#                   are
#
# Anything that is not recognised is handed back unchanged as "&ENTITY;".
sub getcharequiv {
    my ($entity, $convertsymbols) = @_;

    # a numeric entity, decimal or hexadecimal.  The pattern is anchored at
    # both ends so that trailing junk ("#65abc") is not silently dropped;
    # such input now falls through and is returned unchanged.
    if ($entity =~ /\A\#(?:0*(\d+)|[xX]0*([0-9a-fA-F]+))\z/) {
        my $code = defined $1 ? $1 : hex($2);

        # non-standard Microsoft breakage, as usual:
        # code page 1252 uses the reserved bytes 0x80 .. 0x9f
        if ($code >= 0x80 && $code <= 0x9f) {
            my %cp1252_to_unicode = (
                0x91 => 0x2018,    # 145 = single left quote
                0x92 => 0x2019,    # 146 = single right quote
                0x93 => 0x201c,    # 147 = double left quote
                0x94 => 0x201d,    # 148 = double right quote
                # ...
            );
            $code = $cp1252_to_unicode{$code}
                if exists $cp1252_to_unicode{$code};
        }
        return &unicode::unicode2utf8([$code]);
    }

    # a named character entity
    if (defined $charnetosf{$entity}) {
        return &unicode::unicode2utf8([$charnetosf{$entity}]);
    }

    # a named symbol entity
    if ($convertsymbols && defined $symnetosf{$entity}) {
        return &unicode::unicode2utf8([$symnetosf{$entity}]);
    }

    return "&$entity;"; # unknown character
}
251
# convertcharentities(TEXT)
#
# Rewrites TEXT in place (through the @_ alias), handing everything found
# between an '&' and the next ';' to getcharequiv with symbol conversion
# switched off.  Entities getcharequiv does not recognise come back
# unchanged.
sub convertcharentities {
    $_[0] =~ s{&([^;]+);}{getcharequiv($1, 0)}gse;
}
258
# convertallentities(TEXT)
#
# Rewrites TEXT in place (through the @_ alias), handing everything found
# between an '&' and the next ';' to getcharequiv with symbol conversion
# switched on, so that symbol entities are converted too.  Entities
# getcharequiv does not recognise come back unchanged.
sub convertallentities {
    $_[0] =~ s{&([^;]+);}{getcharequiv($1, 1)}gse;
}
265
# html2txt(TEXT, STRIPTAGS)
#
# Converts TEXT to plain ascii, in place (through the @_ alias).
#
#   TEXT       the text that you want converted
#   STRIPTAGS  when true, sgml tags are stripped out first
sub html2txt {
    # strip out sgml tags if needed
    $_[0] =~ s/<[^>]*>//g if $_[1];

    # convert the char entities to the standard html font
    convertcharentities($_[0]);

    # Convert the html character set to a plain ascii character set.
    # Characters below #32 are left alone; every other character is
    # replaced by its %sftotxt entry, or by a single space when it has
    # none.  Replacement text is never rescanned.
    # NOTE(review): if unicode::unicode2utf8 returns multi-byte sequences
    # as a byte string, each byte would be looked up on its own here --
    # confirm against the unicode module.
    $_[0] =~ s{([^\x00-\x1f])}
              { defined $sftotxt{ord($1)} ? $sftotxt{ord($1)} : " " }ge;
}
291
292
# guess_mime_type(FILENAME)
#
# Returns the MIME type registered in %mime_type for the extension of
# FILENAME, or the string "unknown" when the name has no extension or the
# extension is not in the table.
#
# To extend the table, look for mime.types (eg in /etc, or apache/conf
# directories), or have a look at
# <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
sub guess_mime_type {
    my ($filename) = @_;

    # the table only holds lowercase extensions, so match against a
    # lowercased copy of the name
    if (lc($filename) =~ /\.(\w+)$/) {
        my $known_type = $mime_type{$1};
        return $known_type if defined $known_type;
    }

    return "unknown";
}
309
310
3111;
Note: See TracBrowser for help on using the repository browser.