source: main/trunk/greenstone2/perllib/ghtml.pm@ 22653

Last change on this file since 22653 was 22653, checked in by mdewsnip, 14 years ago

Removed dmsafe() function from ghtml.pm. This shouldn't be done in the build-time code because it is a runtime issue -- if you aren't using the Greenstone 2 receptionist (e.g. OAI server, Greenstone 3) then escaping underscores just creates problems.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1###########################################################################
2#
3# ghtml.pm -- this used to be called html.pm but it clashed
4# with the existing html module under windows
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package ghtml;
29
30use strict;
31use unicode;
32
# htmlsafe(TEXT)
#
# Escapes the SGML/HTML meta characters &, <, > and " in TEXT by replacing
# them with the entity references &amp;, &lt;, &gt; and &quot;.
#
# TEXT is modified in place through the @_ alias, so the caller must pass
# a modifiable scalar rather than a literal.  The return value is not
# meaningful.
#
sub htmlsafe
{
    # The ampersand has to be escaped first: the other replacements
    # introduce new '&' characters that must not be escaped again.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/"/&quot;/g;
}
44
# unescape_html(TEXT)
#
# Returns a copy of TEXT in which the entity references &amp;, &lt;, &gt;
# and &quot; have been replaced by the characters they stand for.  The
# argument itself is not modified.
#
# All four entities are decoded in a single pass, so text produced by an
# earlier decode is never decoded again: "&amp;lt;" becomes "&lt;", not "<".
#
sub unescape_html
{
    my ($html) = @_;

    my %char_for = (
        'amp'  => '&',
        'lt'   => '<',
        'gt'   => '>',
        'quot' => '"',
    );

    $html =~ s/&(amp|lt|gt|quot);/$char_for{$1}/g;

    return $html;
}
60
# urlsafe(TEXT)
#
# Replaces characters that are not allowed in a URL with their %XX hex
# representation (lower-case hex digits).  The characters escaped are tab,
# space, " < > [ \ ] ^ ` { | } ~ ? = & + _ and /.
#
# TEXT is modified in place through the @_ alias.  The percent sign is not
# in the escaped set, so existing %XX sequences in TEXT are left untouched.
#
sub urlsafe
{
    # %02x (zero padded) is required: with %2x a tab would be written as
    # "% 9", which is not a valid escape sequence.
    $_[0] =~ s{([\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/])}
              {sprintf("%%%02x", ord($1))}ge;
}
69
70
# Named letter entities (the name between '&' and ';') mapped to the
# decimal character code that getcharequiv() hands to
# unicode::unicode2utf8().  Codes 215 and 247 (the multiplication and
# division signs) are not listed here; they live in %symnetosf.
my %charnetosf = (
    # upper-case letters
    'Agrave' => '192', 'Aacute' => '193', 'Acirc'  => '194', 'Atilde' => '195',
    'Auml'   => '196', 'Aring'  => '197', 'AElig'  => '198', 'Ccedil' => '199',
    'Egrave' => '200', 'Eacute' => '201', 'Ecirc'  => '202', 'Euml'   => '203',
    'Igrave' => '204', 'Iacute' => '205', 'Icirc'  => '206', 'Iuml'   => '207',
    'ETH'    => '208', 'Ntilde' => '209', 'Ograve' => '210', 'Oacute' => '211',
    'Ocirc'  => '212', 'Otilde' => '213', 'Ouml'   => '214',
    'Oslash' => '216', 'Ugrave' => '217', 'Uacute' => '218', 'Ucirc'  => '219',
    'Uuml'   => '220', 'Yacute' => '221', 'THORN'  => '222', 'szlig'  => '223',

    # lower-case letters
    'agrave' => '224', 'aacute' => '225', 'acirc'  => '226', 'atilde' => '227',
    'auml'   => '228', 'aring'  => '229', 'aelig'  => '230', 'ccedil' => '231',
    'egrave' => '232', 'eacute' => '233', 'ecirc'  => '234', 'euml'   => '235',
    'igrave' => '236', 'iacute' => '237', 'icirc'  => '238', 'iuml'   => '239',
    'eth'    => '240', 'ntilde' => '241', 'ograve' => '242', 'oacute' => '243',
    'ocirc'  => '244', 'otilde' => '245', 'ouml'   => '246',
    'oslash' => '248', 'ugrave' => '249', 'uacute' => '250', 'ucirc'  => '251',
    'uuml'   => '252', 'yacute' => '253', 'thorn'  => '254', 'yuml'   => '255',
);
88
# Named symbol entities mapped to their decimal character code.
# getcharequiv() consults this table only when its second argument
# (convert symbols) is true.
my %symnetosf = (
    # markup-significant characters
    'quot'   => '34',  'amp'    => '38',  'lt'     => '60',  'gt'     => '62',

    # codes 160-191
    'nbsp'   => '160', 'iexcl'  => '161', 'cent'   => '162', 'pound'  => '163',
    'curren' => '164', 'yen'    => '165', 'brvbar' => '166', 'sect'   => '167',
    'uml'    => '168', 'copy'   => '169', 'ordf'   => '170', 'laquo'  => '171',
    'not'    => '172', 'shy'    => '173', 'reg'    => '174', 'macr'   => '175',
    'deg'    => '176', 'plusmn' => '177', 'sup2'   => '178', 'sup3'   => '179',
    'acute'  => '180', 'micro'  => '181', 'para'   => '182', 'middot' => '183',
    'cedil'  => '184', 'sup1'   => '185', 'ordm'   => '186', 'raquo'  => '187',
    'frac14' => '188', 'frac12' => '189', 'frac34' => '190', 'iquest' => '191',

    # arithmetic signs
    'times'  => '215', 'divide' => '247',
);
99
100
101
# Character code (decimal, as a string) mapped to a plain ASCII replacement
# string.  html2txt() looks every character with a code of 32 or above up in
# this table and substitutes a single space when there is no entry.
#
# Codes 32-126 map to themselves; the entries above 126 give ASCII
# approximations (for example "198" => "AE", "188" => "1/4").
# Each key is listed exactly once.
my %sftotxt = ("32" => " ", "33" => "!", "34" => "\"", "35" => "\#", "36" => "\$",
               "37" => "\%", "38" => "&", "39" => "'", "40" => "(", "41" => ")",
               "42" => "*", "43" => "+", "44" => ",", "45" => "-", "46" => ".",
               "47" => "/", "48" => "0", "49" => "1", "50" => "2", "51" => "3",
               "52" => "4", "53" => "5", "54" => "6", "55" => "7", "56" => "8",
               "57" => "9", "58" => ":", "59" => ";", "60" => "<", "61" => "=",
               "62" => ">", "63" => "?", "64" => "\@", "65" => "A", "66" => "B",
               "67" => "C", "68" => "D", "69" => "E", "70" => "F", "71" => "G",
               "72" => "H", "73" => "I", "74" => "J", "75" => "K", "76" => "L",
               "77" => "M", "78" => "N", "79" => "O", "80" => "P", "81" => "Q",
               "82" => "R", "83" => "S", "84" => "T", "85" => "U", "86" => "V",
               "87" => "W", "88" => "X", "89" => "Y", "90" => "Z", "91" => "[",
               "92" => "\\", "93" => "]", "94" => "^", "95" => "_", "96" => "`",
               "97" => "a", "98" => "b", "99" => "c", "100" => "d", "101" => "e",
               "102" => "f", "103" => "g", "104" => "h", "105" => "i", "106" => "j",
               "107" => "k", "108" => "l", "109" => "m", "110" => "n", "111" => "o",
               "112" => "p", "113" => "q", "114" => "r", "115" => "s", "116" => "t",
               "117" => "u", "118" => "v", "119" => "w", "120" => "x", "121" => "y",
               "122" => "z", "123" => "{", "124" => "|", "125" => "}", "126" => "~",
               "130" => ",", "131" => "f", "132" => "\"", "133" => "...", "139" => "<",
               "140" => "OE", "145" => "'", "146" => "'", "147" => "\"", "148" => "\"",
               "149" => "o", "150" => "--", "151" => "-", "152" => "~", "153" => "TM",
               "155" => ">", "156" => "oe", "159" => "Y", "160" => " ", "178" => "2",
               "179" => "3", "185" => "1", "188" => "1/4", "189" => "1/2", "190" => "3/4",
               "192" => "A", "193" => "A", "194" => "A", "195" => "A", "196" => "A",
               "197" => "A", "198" => "AE", "199" => "C", "200" => "E", "201" => "E",
               "202" => "E", "203" => "E", "204" => "I", "205" => "I", "206" => "I",
               "207" => "I", "208" => "D", "209" => "N", "210" => "O", "211" => "O",
               "212" => "O", "213" => "O", "214" => "O", "215" => "*", "216" => "O",
               "217" => "U", "218" => "U", "219" => "U", "220" => "U", "221" => "Y",
               "223" => "ss", "224" => "a", "225" => "a", "226" => "a", "227" => "a",
               "228" => "a", "229" => "a", "230" => "ae", "231" => "c", "232" => "e",
               "233" => "e", "234" => "e", "235" => "e", "236" => "i", "237" => "i",
               "238" => "i", "239" => "i", "241" => "n", "242" => "o", "243" => "o",
               "244" => "o", "245" => "o", "246" => "o", "247" => "/", "248" => "o",
               "249" => "u", "250" => "u", "251" => "u", "252" => "u", "253" => "y",
               "255" => "y", "8218" => ",");
142
143
# Lower-case file extension (without the dot) mapped to a MIME type.
# guess_mime_type() looks extensions up here.
my %mime_type = (
    'ai'      => 'application/postscript',
    'aif'     => 'audio/x-aiff',
    'aifc'    => 'audio/x-aiff',
    'aiff'    => 'audio/x-aiff',
    'au'      => 'audio/basic',
    'avi'     => 'video/x-msvideo',
    'bcpio'   => 'application/x-bcpio',
    'bin'     => 'application/octet-stream',
    'cdf'     => 'application/x-netcdf',
    'class'   => 'application/octet-stream',
    'cpio'    => 'application/x-cpio',
    'cpt'     => 'application/mac-compactpro',
    'csh'     => 'application/x-csh',
    'dcr'     => 'application/x-director',
    'dir'     => 'application/x-director',
    'dms'     => 'application/octet-stream',
    'doc'     => 'application/msword',
    'dvi'     => 'application/x-dvi',
    'dxr'     => 'application/x-director',
    'eps'     => 'application/postscript',
    'etx'     => 'text/x-setext',
    'exe'     => 'application/octet-stream',
    'gif'     => 'image/gif',
    'gtar'    => 'application/x-gtar',
    'hdf'     => 'application/x-hdf',
    'hqx'     => 'application/mac-binhex40',
    'htm'     => 'text/html',
    'html'    => 'text/html',
    'ice'     => 'x-conference/x-cooltalk',
    'ief'     => 'image/ief',
    'jpe'     => 'image/jpeg',
    'jpeg'    => 'image/jpeg',
    'jpg'     => 'image/jpeg',
    'kar'     => 'audio/midi',
    'latex'   => 'application/x-latex',
    'lha'     => 'application/octet-stream',
    'lzh'     => 'application/octet-stream',
    'man'     => 'application/x-troff-man',
    'mcf'     => 'image/vasa',
    'me'      => 'application/x-troff-me',
    'mid'     => 'audio/midi',
    'midi'    => 'audio/midi',
    'mif'     => 'application/x-mif',
    'mov'     => 'video/quicktime',
    'movie'   => 'video/x-sgi-movie',
    'mp2'     => 'audio/mpeg',
    'mpe'     => 'video/mpeg',
    'mpeg'    => 'video/mpeg',
    'mpg'     => 'video/mpeg',
    'mpga'    => 'audio/mpeg',
    'ms'      => 'application/x-troff-ms',
    'nc'      => 'application/x-netcdf',
    'oda'     => 'application/oda',
    'pbm'     => 'image/x-portable-bitmap',
    'pdb'     => 'chemical/x-pdb',
    'pdf'     => 'application/pdf',
    'pgm'     => 'image/x-portable-graymap',
    'png'     => 'image/png',
    'pnm'     => 'image/x-portable-anymap',
    'ppm'     => 'image/x-portable-pixmap',
    'ppt'     => 'application/vnd.ms-powerpoint',
    'ps'      => 'application/postscript',
    'qt'      => 'video/quicktime',
    'ra'      => 'audio/x-realaudio',
    'ram'     => 'audio/x-pn-realaudio',
    'ras'     => 'image/x-cmu-raster',
    'rgb'     => 'image/x-rgb',
    'roff'    => 'application/x-troff',
    'rpm'     => 'audio/x-pn-realaudio-plugin',
    'rtf'     => 'application/rtf',
    'rtx'     => 'text/richtext',
    'sgm'     => 'text/x-sgml',
    'sgml'    => 'text/x-sgml',
    'sh'      => 'application/x-sh',
    'shar'    => 'application/x-shar',
    'sit'     => 'application/x-stuffit',
    'skd'     => 'application/x-koan',
    'skm'     => 'application/x-koan',
    'skp'     => 'application/x-koan',
    'skt'     => 'application/x-koan',
    'snd'     => 'audio/basic',
    'src'     => 'application/x-wais-source',
    'sv4cpio' => 'application/x-sv4cpio',
    'sv4crc'  => 'application/x-sv4crc',
    't'       => 'application/x-troff',
    'tar'     => 'application/x-tar',
    'tcl'     => 'application/x-tcl',
    'tex'     => 'application/x-tex',
    'texi'    => 'application/x-texinfo',
    'texinfo' => 'application/x-texinfo',
    'tif'     => 'image/tiff',
    'tiff'    => 'image/tiff',
    'tr'      => 'application/x-troff',
    'tsv'     => 'text/tab-separated-values',
    'txt'     => 'text/plain',
    'ustar'   => 'application/x-ustar',
    'vcd'     => 'application/x-cdlink',
    'vrml'    => 'x-world/x-vrml',
    'wav'     => 'audio/x-wav',
    'wrl'     => 'x-world/x-vrml',
    'xbm'     => 'image/x-xbitmap',
    'xls'     => 'application/vnd.ms-excel',
    'xpm'     => 'image/x-xpixmap',
    'xwd'     => 'image/x-xwindowdump',
    'xyz'     => 'chemical/x-pdb',
    'zip'     => 'application/zip',
);
199
200
# getcharequiv(ENTITY, CONVERTSYMBOLS)
#
# Resolves one character entity.  ENTITY is the text between the '&' and
# the ';' (both already stripped off), e.g. "eacute", "#233" or "#xE9".
#
# Returns whatever unicode::unicode2utf8() produces for the resolved
# character code (per the original comment, the raw utf-8 character).
# Entities that cannot be resolved are returned unchanged, with the '&'
# and ';' put back.
#
#   - Numeric references may be decimal (#233, leading zeros allowed) or
#     hexadecimal (#xE9).  Anything after the digits makes the reference
#     malformed, and it is then returned unchanged rather than truncated.
#   - Named letter entities (%charnetosf) are always converted.
#   - Named symbol entities (%symnetosf) are converted only when
#     CONVERTSYMBOLS is true.
#
sub getcharequiv {
    my ($entity, $convertsymbols) = @_;

    # a numeric entity
    if ($entity =~ /\A\#(?:[xX]([0-9A-Fa-f]+)|([0-9]+))\z/) {
        my $code = defined $1 ? hex($1) : $2 + 0;

        # non-standard Microsoft breakage: code page 1252 uses reserved
        # codes for the typographic quotes, so map those to the real
        # characters.  Other reserved codes are passed through as they are.
        my %cp1252_quote = (
            0x91 => 0x2018,    # 145 = single left quote
            0x92 => 0x2019,    # 146 = single right quote
            0x93 => 0x201c,    # 147 = double left quote
            0x94 => 0x201d,    # 148 = double right quote
        );
        $code = $cp1252_quote{$code} if exists $cp1252_quote{$code};

        return unicode::unicode2utf8([$code]);
    }

    # a named character entity
    if (defined $charnetosf{$entity}) {
        return unicode::unicode2utf8([$charnetosf{$entity}]);
    }

    # a named symbol entity
    if ($convertsymbols && defined $symnetosf{$entity}) {
        return unicode::unicode2utf8([$symnetosf{$entity}]);
    }

    return "&$entity;"; # unknown character
}
232
# convertcharentities(TEXT)
#
# Replaces character entity references in TEXT, in place, with the value
# getcharequiv() returns for them.  Symbol entities are not converted
# (getcharequiv is called with its convert-symbols flag set to 0).
#
# Only well-formed references are matched: '&', an optional '#', then
# letters and digits, then ';'.  A stray '&' earlier in the text therefore
# cannot swallow everything up to the next ';' and hide a genuine entity.
#
sub convertcharentities {
    # args: the text that you want to convert

    $_[0] =~ s/&(\#?[0-9A-Za-z]+);/getcharequiv($1, 0)/ge;
}
239
# convertallentities(TEXT)
#
# Replaces entity references in TEXT, in place, with the value
# getcharequiv() returns for them.  Symbol entities are converted as well
# (getcharequiv is called with its convert-symbols flag set to 1).
#
# Only well-formed references are matched: '&', an optional '#', then
# letters and digits, then ';'.  A stray '&' earlier in the text therefore
# cannot swallow everything up to the next ';' and hide a genuine entity.
#
sub convertallentities {
    # args: the text that you want to convert

    $_[0] =~ s/&(\#?[0-9A-Za-z]+);/getcharequiv($1, 1)/ge;
}
246
# html2txt(TEXT, STRIPTAGS)
#
# Reduces TEXT, in place, to plain ASCII:
#   1. when STRIPTAGS is true, everything of the form <...> is removed;
#   2. character entities are converted with convertcharentities();
#   3. every character whose code is 32 or above is replaced by its
#      %sftotxt entry, or by a single space when it has none.
# Characters with a code below 32 are left as they are.
#
# NOTE(review): the lookup in step 3 works one character at a time, so if
# step 2 leaves multi-byte UTF-8 sequences in a byte string, each byte is
# looked up on its own -- confirm this is what callers expect.
sub html2txt {
    my $strip_tags = $_[1];

    # strip out sgml tags if needed
    if ($strip_tags) {
        $_[0] =~ s/<[^>]*>//g;
    }

    # convert the char entities to the standard html font
    &convertcharentities($_[0]);

    # convert the html character set to a plain ascii character set;
    # the character class matches exactly the characters with code >= 32
    $_[0] =~ s{([^\x00-\x1f])}{
        my $plain = $sftotxt{ord($1)};
        defined $plain ? $plain : " ";
    }ge;
}
272
273
# guess_mime_type(FILENAME)
#
# Returns the MIME type registered in %mime_type for FILENAME's extension
# (the run of word characters after the last dot, compared in lower case),
# or the string "unknown" when there is no extension or it is not in the
# table.
#
# For more types look for mime.types (eg in /etc, or apache/conf
# directories), or have a look at
# <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
sub guess_mime_type {
    my ($filename) = @_;

    # the table is keyed by lower-case extensions
    if (lc($filename) =~ /\.(\w+)$/) {
        my $known_type = $mime_type{$1};
        return $known_type if defined $known_type;
    }

    return "unknown";
}
290
291
2921;
Note: See TracBrowser for help on using the repository browser.