source: main/trunk/greenstone2/perllib/ghtml.pm@ 23182

Last change on this file since 23182 was 22952, checked in by davidb, 14 years ago

Encode::decode cannot be applied to every value returned by ghtml::getcharequiv(). If getcharequiv() does not recognize an entity then it does not encode it, and so decode() cannot be applied to that result. getcharequiv() has been upgraded to take an optional extra parameter that says whether or not to decode the equivalent character when there is one the entity can be mapped to.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.2 KB
Line 
1###########################################################################
2#
3# ghtml.pm -- this used to be called html.pm but it clashed
4# with the existing html module under windows
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package ghtml;
29
30use strict;
31use unicode;
32
# htmlsafe(TEXT)
#
# Converts SGML meta characters in TEXT to entity references:
#   &  ->  &amp;     <  ->  &lt;     >  ->  &gt;     "  ->  &quot;
#
# TEXT is modified in place through the @_ alias, so the caller must pass
# a modifiable scalar; nothing useful is returned.
#
sub htmlsafe
{
    # The ampersand has to be escaped first, otherwise the '&' that starts
    # each entity produced below would itself be escaped again.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/\"/&quot;/g;
}
44
# unescape_html(TEXT)
#
# Converts the HTML entities &amp; &lt; &gt; and &quot; back into the
# characters they stand for and returns the result. TEXT itself is not
# modified; any other entity is left untouched.
#
sub unescape_html
{
    my ($html) = @_;

    my %char_for = ("amp" => "&", "lt" => "<", "gt" => ">", "quot" => "\"");

    # A single pass is used so that the output of one replacement is never
    # scanned again: "&amp;lt;" must become "&lt;", not "<".
    $html =~ s/&(amp|lt|gt|quot);/$char_for{$1}/g;

    return $html;
}
60
# urlsafe(TEXT)
#
# Converts characters not allowed in a URL to their hex representation
# (a '%' followed by two lowercase hex digits).
#
# TEXT is modified in place through the @_ alias. Note that a literal '%'
# is not part of the escaped set, so it is passed through unchanged.
#
sub urlsafe
{
    # "%02x" (zero padded) rather than "%2x" (space padded): a tab must
    # become "%09", not "% 9".
    $_[0] =~ s/([\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/])/sprintf("%%%02x", ord($1))/ge;
}
69
70
# Named character entities (the accented and special letters) mapped to
# their decimal character codes, listed as NAME CODE pairs. Codes 215 and
# 247 are absent here; they are symbols and live in %symnetosf.
my %charnetosf = qw(
    Agrave 192   Aacute 193   Acirc  194   Atilde 195
    Auml   196   Aring  197   AElig  198   Ccedil 199
    Egrave 200   Eacute 201   Ecirc  202   Euml   203
    Igrave 204   Iacute 205   Icirc  206   Iuml   207
    ETH    208   Ntilde 209   Ograve 210   Oacute 211
    Ocirc  212   Otilde 213   Ouml   214
    Oslash 216   Ugrave 217   Uacute 218   Ucirc  219
    Uuml   220   Yacute 221   THORN  222   szlig  223
    agrave 224   aacute 225   acirc  226   atilde 227
    auml   228   aring  229   aelig  230   ccedil 231
    egrave 232   eacute 233   ecirc  234   euml   235
    igrave 236   iacute 237   icirc  238   iuml   239
    eth    240   ntilde 241   ograve 242   oacute 243
    ocirc  244   otilde 245   ouml   246
    oslash 248   ugrave 249   uacute 250   ucirc  251
    uuml   252   yacute 253   thorn  254   yuml   255
);
88
# Named symbol entities (punctuation, currency, fractions and the like)
# mapped to their decimal character codes, listed as NAME CODE pairs.
my %symnetosf = qw(
    quot   34    amp    38    lt     60    gt     62
    nbsp   160   iexcl  161   cent   162   pound  163
    curren 164   yen    165   brvbar 166   sect   167
    uml    168   copy   169   ordf   170   laquo  171
    not    172   shy    173   reg    174   macr   175
    deg    176   plusmn 177   sup2   178   sup3   179
    acute  180   micro  181   para   182   middot 183
    cedil  184   sup1   185   ordm   186   raquo  187
    frac14 188   frac12 189   frac34 190   iquest 191
    times  215   divide 247
);
99
100
101
# standard font to plain text: maps a decimal character code to the plain
# ASCII text used in its place (possibly more than one character).
my %sftotxt = (
    # every printable ASCII code, space through tilde, maps to itself
    (map { ($_ => chr($_)) } 32 .. 126),

    # codes 130-159
    130 => ',',   131 => 'f',   132 => '"',   133 => '...', 139 => '<',
    140 => 'OE',  145 => "'",   146 => "'",   147 => '"',   148 => '"',
    149 => 'o',   150 => '--',  151 => '-',   152 => '~',   153 => 'TM',
    155 => '>',   156 => 'oe',  159 => 'Y',

    # symbols in the 160-191 range that have an obvious ASCII spelling
    160 => ' ',   178 => '2',   179 => '3',   185 => '1',
    188 => '1/4', 189 => '1/2', 190 => '3/4',

    # upper-case accented letters (and the multiplication sign)
    192 => 'A',   193 => 'A',   194 => 'A',   195 => 'A',   196 => 'A',
    197 => 'A',   198 => 'AE',  199 => 'C',   200 => 'E',   201 => 'E',
    202 => 'E',   203 => 'E',   204 => 'I',   205 => 'I',   206 => 'I',
    207 => 'I',   208 => 'D',   209 => 'N',   210 => 'O',   211 => 'O',
    212 => 'O',   213 => 'O',   214 => 'O',   215 => '*',   216 => 'O',
    217 => 'U',   218 => 'U',   219 => 'U',   220 => 'U',   221 => 'Y',

    # lower-case accented letters (and the division sign)
    223 => 'ss',  224 => 'a',   225 => 'a',   226 => 'a',   227 => 'a',
    228 => 'a',   229 => 'a',   230 => 'ae',  231 => 'c',   232 => 'e',
    233 => 'e',   234 => 'e',   235 => 'e',   236 => 'i',   237 => 'i',
    238 => 'i',   239 => 'i',   241 => 'n',   242 => 'o',   243 => 'o',
    244 => 'o',   245 => 'o',   246 => 'o',   247 => '/',   248 => 'o',
    249 => 'u',   250 => 'u',   251 => 'u',   252 => 'u',   253 => 'y',
    255 => 'y',

    8218 => ',',
);
142
143
# Lowercase file extension (without the dot) mapped to its MIME type,
# listed as EXTENSION TYPE pairs.
my %mime_type = qw(
    ai       application/postscript
    aif      audio/x-aiff
    aifc     audio/x-aiff
    aiff     audio/x-aiff
    au       audio/basic
    avi      video/x-msvideo
    bcpio    application/x-bcpio
    bin      application/octet-stream
    cdf      application/x-netcdf
    class    application/octet-stream
    cpio     application/x-cpio
    cpt      application/mac-compactpro
    csh      application/x-csh
    dcr      application/x-director
    dir      application/x-director
    dms      application/octet-stream
    doc      application/msword
    dvi      application/x-dvi
    dxr      application/x-director
    eps      application/postscript
    etx      text/x-setext
    exe      application/octet-stream
    gif      image/gif
    gtar     application/x-gtar
    hdf      application/x-hdf
    hqx      application/mac-binhex40
    htm      text/html
    html     text/html
    ice      x-conference/x-cooltalk
    ief      image/ief
    jpe      image/jpeg
    jpeg     image/jpeg
    jpg      image/jpeg
    kar      audio/midi
    latex    application/x-latex
    lha      application/octet-stream
    lzh      application/octet-stream
    man      application/x-troff-man
    mcf      image/vasa
    me       application/x-troff-me
    mid      audio/midi
    midi     audio/midi
    mif      application/x-mif
    mov      video/quicktime
    movie    video/x-sgi-movie
    mp2      audio/mpeg
    mpe      video/mpeg
    mpeg     video/mpeg
    mpg      video/mpeg
    mpga     audio/mpeg
    ms       application/x-troff-ms
    nc       application/x-netcdf
    oda      application/oda
    pbm      image/x-portable-bitmap
    pdb      chemical/x-pdb
    pdf      application/pdf
    pgm      image/x-portable-graymap
    png      image/png
    pnm      image/x-portable-anymap
    ppm      image/x-portable-pixmap
    ppt      application/vnd.ms-powerpoint
    ps       application/postscript
    qt       video/quicktime
    ra       audio/x-realaudio
    ram      audio/x-pn-realaudio
    ras      image/x-cmu-raster
    rgb      image/x-rgb
    roff     application/x-troff
    rpm      audio/x-pn-realaudio-plugin
    rtf      application/rtf
    rtx      text/richtext
    sgm      text/x-sgml
    sgml     text/x-sgml
    sh       application/x-sh
    shar     application/x-shar
    sit      application/x-stuffit
    skd      application/x-koan
    skm      application/x-koan
    skp      application/x-koan
    skt      application/x-koan
    snd      audio/basic
    src      application/x-wais-source
    sv4cpio  application/x-sv4cpio
    sv4crc   application/x-sv4crc
    t        application/x-troff
    tar      application/x-tar
    tcl      application/x-tcl
    tex      application/x-tex
    texi     application/x-texinfo
    texinfo  application/x-texinfo
    tif      image/tiff
    tiff     image/tiff
    tr       application/x-troff
    tsv      text/tab-separated-values
    txt      text/plain
    ustar    application/x-ustar
    vcd      application/x-cdlink
    vrml     x-world/x-vrml
    wav      audio/x-wav
    wrl      x-world/x-vrml
    xbm      image/x-xbitmap
    xls      application/vnd.ms-excel
    xpm      image/x-xpixmap
    xwd      image/x-xwindowdump
    xyz      chemical/x-pdb
    zip      application/zip
);
199
200
# getcharequiv(ENTITY, CONVERTSYMBOLS, AND_DECODE)
#
# Returns the character that ENTITY stands for. It assumes that the
# & and ; have been stripped off the string.
#
#   ENTITY          a numeric reference ("#233", "#xE9") or an entity name
#                   ("eacute").
#   CONVERTSYMBOLS  when true, names found in %symnetosf (quot, amp, nbsp,
#                   ...) are converted as well as those in %charnetosf.
#   AND_DECODE      optional; when true the result is passed through
#                   Encode::decode("utf8", ...) before being returned.
#
# By default the result is whatever unicode::unicode2utf8 produces for the
# code (raw utf-8). If ENTITY is not recognised, the original text
# "&ENTITY;" is returned instead, and it is never decoded.
sub getcharequiv {
    my ($entity, $convertsymbols, $and_decode) = @_;

    my $char_equiv = undef;
    my $code = undef;

    # a hexadecimal numeric entity, e.g. "#x201C"
    if ($entity =~ /^\#[xX]([0-9a-fA-F]+)/) {
        $code = hex($1);
    }
    # a decimal numeric entity, e.g. "#8220"
    elsif ($entity =~ /^\#0*(\d+)/) {
        $code = $1;
    }

    if (defined $code) {
        # non-standard Microsoft breakage, as usual
        if ($code < 0x9f) { # code page 1252 uses reserved bytes
            if    ($code == 0x91) { $code = 0x2018 } # 145 = single left quote
            elsif ($code == 0x92) { $code = 0x2019 } # 146 = single right quote
            elsif ($code == 0x93) { $code = 0x201c } # 147 = double left quote
            elsif ($code == 0x94) { $code = 0x201d } # 148 = double right quote
            # ...
        }
        $char_equiv = &unicode::unicode2utf8([$code]);
    }

    # a named character entity
    elsif (defined $charnetosf{$entity}) {
        $char_equiv = &unicode::unicode2utf8([$charnetosf{$entity}]);
    }

    # a named symbol entity
    elsif ($convertsymbols && defined $symnetosf{$entity}) {
        $char_equiv = &unicode::unicode2utf8([$symnetosf{$entity}]);
    }

    if (!defined $char_equiv) {
        return "&$entity;"; # unknown character
    }

    if ($and_decode) {
        # this file does not load Encode itself, so make sure it is there
        require Encode;
        $char_equiv = Encode::decode("utf8", $char_equiv);
    }
    return $char_equiv;
}
242
# convert character entities from named equivalents to html font
#
# Numeric references and the names known to getcharequiv() as character
# entities are replaced; symbol entities (&amp;, &nbsp;, ...) are left as
# they are. The text is modified in place through the @_ alias.
sub convertcharentities {
    # args: the text that you want to convert

    # An entity name can never contain '&'. Excluding it stops a stray
    # ampersand from swallowing the text up to the next ';', which would
    # hide a real entity from the conversion ("a & b &eacute;").
    $_[0] =~ s/&([^&;]+);/&getcharequiv($1,0)/ge;
}
249
# convert any entities from named equivalents to html font
#
# Numeric references, character entities and symbol entities (&amp;,
# &nbsp;, ...) known to getcharequiv() are all replaced. The text is
# modified in place through the @_ alias.
sub convertallentities {
    # args: the text that you want to convert

    # An entity name can never contain '&'. Excluding it stops a stray
    # ampersand from swallowing the text up to the next ';', which would
    # hide a real entity from the conversion ("a & b &eacute;").
    $_[0] =~ s/&([^&;]+);/&getcharequiv($1,1)/ge;
}
256
# html2txt(TEXT, STRIPTAGS)
#
# Converts TEXT to plain ascii, in place (through the @_ alias):
#   - when STRIPTAGS is true, anything of the form <...> is removed;
#   - numeric references and named character entities are expanded
#     (symbol entities such as &amp; are left as they are);
#   - every character with a code of 32 or more is replaced by its entry
#     in %sftotxt, or by a single space when it has none.
# Characters with a code below 32 (newlines, tabs, ...) are kept.
sub html2txt {
    # args: the text that you want converted to ascii,
    # and whether to strip out sgml tags

    # strip out sgml tags if needed
    $_[0] =~ s/<[^>]*>//g if $_[1];

    # Expand the character entities. %sftotxt is keyed by character code
    # (233 for e-acute, 8218 for the low quote), so the entities must be
    # turned into decoded characters here: expanding them to utf-8 bytes
    # would make the lookup below see each byte separately and turn
    # "&eacute;" into "A " instead of "e".
    require Encode; # getcharequiv calls Encode::decode when asked to decode
    $_[0] =~ s/&([^&;]+);/&getcharequiv($1,0,1)/ge;

    # convert the html character set to a plain ascii character set;
    # replacement text is not scanned again
    $_[0] =~ s{([^\x00-\x1f])}{
        my $replacechars = $sftotxt{ord($1)};
        defined $replacechars ? $replacechars : " ";
    }ge;
}
282
283
# guess_mime_type(FILENAME)
#
# Returns the MIME type registered in %mime_type for the extension of
# FILENAME, or the string "unknown" when the name has no extension or the
# extension is not in the table.
#
# For defaults, look for mime.types (eg in /etc, or apache/conf
# directories), or have a look at
# <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/>.
sub guess_mime_type {
    my ($filename) = @_;

    # the table only has lowercase keys, so fold the name before matching
    if (lc($filename) =~ /\.(\w+)$/) {
        my $known_type = $mime_type{$1};
        return $known_type if defined $known_type;
    }

    return "unknown";
}
300
301
3021;
Note: See TracBrowser for help on using the repository browser.