source: main/trunk/greenstone2/perllib/ghtml.pm@ 23362

Last change on this file since 23362 was 23362, checked in by davidb, 13 years ago

Additional routines (and a few upgraded ones) to help Greenstone work with filenames under Windows when they go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
Line 
1###########################################################################
2#
3# ghtml.pm -- this used to be called html.pm but it clashed
4# with the existing html module under windows
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28package ghtml;
29
30use strict;
31use unicode;
32
# htmlsafe(TEXT)
#
# Escapes the SGML/HTML meta characters & < > and " in TEXT by replacing
# them with the entity references &amp; &lt; &gt; and &quot;.
#
# TEXT is rewritten in place through the @_ alias, so callers pass the
# variable they want changed.  The single quote is not escaped.  The
# sub has no explicit return value.
#
sub htmlsafe
{
    # The ampersand must be handled first: the other three replacements
    # introduce '&' characters of their own, which must not be escaped
    # a second time.
    $_[0] =~ s/&/&amp;/g;
    $_[0] =~ s/</&lt;/g;
    $_[0] =~ s/>/&gt;/g;
    $_[0] =~ s/\"/&quot;/g;
}
44
# unescape_html(TEXT)
#
# Returns a copy of TEXT in which the entity references &amp; &lt; &gt;
# and &quot; have been replaced by the characters they stand for.  The
# argument itself is not modified.  Any other entity is left untouched.
#
sub unescape_html
{
    my ($html) = @_;

    my %char_for = ("amp" => "&", "lt" => "<", "gt" => ">", "quot" => "\"");

    # All four references are decoded in a single pass so that the output
    # of one replacement is never re-read as input: "&amp;lt;" has to
    # become the literal text "&lt;", not "<".
    $html =~ s/&(amp|lt|gt|quot);/$char_for{$1}/g;

    return $html;
}
60
# urlsafe(TEXT)
#
# Replaces, in place, each of the following characters in TEXT with a
# percent sign followed by its two-digit lowercase hex code:
#   tab, space, " < > [ \ ] ^ ` { | } ~ ? = & + _ /
#
# TEXT is rewritten through the @_ alias.  Note that '%' itself is not in
# the set, so text that is already percent-encoded is not encoded again.
#
sub urlsafe
{
    # %02x (not %2x) so that codes below 0x10, i.e. the tab, are padded
    # with a zero rather than a space: "%09", not "% 9".
    $_[0] =~ s/([\x09\x20\x22\x3c\x3e\x5b\x5c\x5d\x5e\x60\x7b\x7c\x7d\x7e\?\=\&\+_\/])/sprintf("%%%02x", ord($1))/ge;
}
69
70
# Named HTML entities for accented letters, keyed by entity name (without
# the surrounding '&' and ';').  Each value is the decimal character code
# that getcharequiv hands to unicode::unicode2utf8.  Codes 215 and 247 are
# absent here; they are listed in %symnetosf.
my %charnetosf = qw(
    Agrave 192   Aacute 193   Acirc  194   Atilde 195
    Auml   196   Aring  197   AElig  198   Ccedil 199
    Egrave 200   Eacute 201   Ecirc  202   Euml   203
    Igrave 204   Iacute 205   Icirc  206   Iuml   207
    ETH    208   Ntilde 209   Ograve 210   Oacute 211
    Ocirc  212   Otilde 213   Ouml   214
    Oslash 216   Ugrave 217   Uacute 218   Ucirc  219
    Uuml   220   Yacute 221   THORN  222   szlig  223
    agrave 224   aacute 225   acirc  226   atilde 227
    auml   228   aring  229   aelig  230   ccedil 231
    egrave 232   eacute 233   ecirc  234   euml   235
    igrave 236   iacute 237   icirc  238   iuml   239
    eth    240   ntilde 241   ograve 242   oacute 243
    ocirc  244   otilde 245   ouml   246
    oslash 248   ugrave 249   uacute 250   ucirc  251
    uuml   252   yacute 253   thorn  254   yuml   255
);
88
# Named HTML entities for symbols and punctuation, keyed by entity name
# (without the surrounding '&' and ';').  Each value is the decimal
# character code.  getcharequiv consults this table only when its
# CONVERTSYMBOLS argument is true.
my %symnetosf = qw(
    quot   34    amp    38    lt     60    gt     62
    nbsp   160   iexcl  161   cent   162   pound  163
    curren 164   yen    165   brvbar 166   sect   167
    uml    168   copy   169   ordf   170   laquo  171
    not    172   shy    173   reg    174   macr   175
    deg    176   plusmn 177   sup2   178   sup3   179
    acute  180   micro  181   para   182   middot 183
    cedil  184   sup1   185   ordm   186   raquo  187
    frac14 188   frac12 189   frac34 190   iquest 191
    times  215   divide 247
);
99
100
101
# Character code -> plain-text replacement, used by html2txt.  Keys are
# decimal character codes; values are the ASCII text substituted for that
# character.  html2txt replaces any code of 32 or above that has no entry
# here with a single space.
my %sftotxt = (
    # 32..126: every printable ASCII character stands for itself.
    (map { ($_, chr($_)) } 32 .. 126),

    # 130..159: ASCII approximations; these look like Windows-1252
    # punctuation and ligatures (TODO confirm against callers).
    "130" => ",", "131" => "f", "132" => "\"", "133" => "...", "139" => "<",
    "140" => "OE", "145" => "'", "146" => "'", "147" => "\"", "148" => "\"",
    "149" => "o", "150" => "--", "151" => "-", "152" => "~", "153" => "TM",
    "155" => ">", "156" => "oe", "159" => "Y",

    # 160..255: non-breaking space, superscripts, fractions and accented
    # letters reduced to their closest unaccented spelling.
    "160" => " ", "178" => "2",
    "179" => "3", "185" => "1", "188" => "1/4", "189" => "1/2", "190" => "3/4",
    "192" => "A", "193" => "A", "194" => "A", "195" => "A", "196" => "A",
    "197" => "A", "198" => "AE", "199" => "C", "200" => "E", "201" => "E",
    "202" => "E", "203" => "E", "204" => "I", "205" => "I", "206" => "I",
    "207" => "I", "208" => "D", "209" => "N", "210" => "O", "211" => "O",
    "212" => "O", "213" => "O", "214" => "O", "215" => "*", "216" => "O",
    "217" => "U", "218" => "U", "219" => "U", "220" => "U", "221" => "Y",
    "223" => "ss", "224" => "a", "225" => "a", "226" => "a", "227" => "a",
    "228" => "a", "229" => "a", "230" => "ae", "231" => "c", "232" => "e",
    "233" => "e", "234" => "e", "235" => "e", "236" => "i", "237" => "i",
    "238" => "i", "239" => "i", "241" => "n", "242" => "o", "243" => "o",
    "244" => "o", "245" => "o", "246" => "o", "247" => "/", "248" => "o",
    "249" => "u", "250" => "u", "251" => "u", "252" => "u", "253" => "y",
    "255" => "y",

    # The one entry above 255.
    "8218" => ",",
);
142
143
# File extension (lowercase, without the dot) -> MIME type.  This is the
# lookup table behind guess_mime_type.
my %mime_type = (
    'ai'      => 'application/postscript',
    'aif'     => 'audio/x-aiff',
    'aifc'    => 'audio/x-aiff',
    'aiff'    => 'audio/x-aiff',
    'au'      => 'audio/basic',
    'avi'     => 'video/x-msvideo',
    'bcpio'   => 'application/x-bcpio',
    'bin'     => 'application/octet-stream',
    'cdf'     => 'application/x-netcdf',
    'class'   => 'application/octet-stream',
    'cpio'    => 'application/x-cpio',
    'cpt'     => 'application/mac-compactpro',
    'csh'     => 'application/x-csh',
    'dcr'     => 'application/x-director',
    'dir'     => 'application/x-director',
    'dms'     => 'application/octet-stream',
    'doc'     => 'application/msword',
    'dvi'     => 'application/x-dvi',
    'dxr'     => 'application/x-director',
    'eps'     => 'application/postscript',
    'etx'     => 'text/x-setext',
    'exe'     => 'application/octet-stream',
    'gif'     => 'image/gif',
    'gtar'    => 'application/x-gtar',
    'hdf'     => 'application/x-hdf',
    'hqx'     => 'application/mac-binhex40',
    'htm'     => 'text/html',
    'html'    => 'text/html',
    'ice'     => 'x-conference/x-cooltalk',
    'ief'     => 'image/ief',
    'jpe'     => 'image/jpeg',
    'jpeg'    => 'image/jpeg',
    'jpg'     => 'image/jpeg',
    'kar'     => 'audio/midi',
    'latex'   => 'application/x-latex',
    'lha'     => 'application/octet-stream',
    'lzh'     => 'application/octet-stream',
    'man'     => 'application/x-troff-man',
    'mcf'     => 'image/vasa',
    'me'      => 'application/x-troff-me',
    'mid'     => 'audio/midi',
    'midi'    => 'audio/midi',
    'mif'     => 'application/x-mif',
    'mov'     => 'video/quicktime',
    'movie'   => 'video/x-sgi-movie',
    'mp2'     => 'audio/mpeg',
    'mpe'     => 'video/mpeg',
    'mpeg'    => 'video/mpeg',
    'mpg'     => 'video/mpeg',
    'mpga'    => 'audio/mpeg',
    'ms'      => 'application/x-troff-ms',
    'nc'      => 'application/x-netcdf',
    'oda'     => 'application/oda',
    'pbm'     => 'image/x-portable-bitmap',
    'pdb'     => 'chemical/x-pdb',
    'pdf'     => 'application/pdf',
    'pgm'     => 'image/x-portable-graymap',
    'png'     => 'image/png',
    'pnm'     => 'image/x-portable-anymap',
    'ppm'     => 'image/x-portable-pixmap',
    'ppt'     => 'application/vnd.ms-powerpoint',
    'ps'      => 'application/postscript',
    'qt'      => 'video/quicktime',
    'ra'      => 'audio/x-realaudio',
    'ram'     => 'audio/x-pn-realaudio',
    'ras'     => 'image/x-cmu-raster',
    'rgb'     => 'image/x-rgb',
    'roff'    => 'application/x-troff',
    'rpm'     => 'audio/x-pn-realaudio-plugin',
    'rtf'     => 'application/rtf',
    'rtx'     => 'text/richtext',
    'sgm'     => 'text/x-sgml',
    'sgml'    => 'text/x-sgml',
    'sh'      => 'application/x-sh',
    'shar'    => 'application/x-shar',
    'sit'     => 'application/x-stuffit',
    'skd'     => 'application/x-koan',
    'skm'     => 'application/x-koan',
    'skp'     => 'application/x-koan',
    'skt'     => 'application/x-koan',
    'snd'     => 'audio/basic',
    'src'     => 'application/x-wais-source',
    'sv4cpio' => 'application/x-sv4cpio',
    'sv4crc'  => 'application/x-sv4crc',
    't'       => 'application/x-troff',
    'tar'     => 'application/x-tar',
    'tcl'     => 'application/x-tcl',
    'tex'     => 'application/x-tex',
    'texi'    => 'application/x-texinfo',
    'texinfo' => 'application/x-texinfo',
    'tif'     => 'image/tiff',
    'tiff'    => 'image/tiff',
    'tr'      => 'application/x-troff',
    'tsv'     => 'text/tab-separated-values',
    'txt'     => 'text/plain',
    'ustar'   => 'application/x-ustar',
    'vcd'     => 'application/x-cdlink',
    'vrml'    => 'x-world/x-vrml',
    'wav'     => 'audio/x-wav',
    'wrl'     => 'x-world/x-vrml',
    'xbm'     => 'image/x-xbitmap',
    'xls'     => 'application/vnd.ms-excel',
    'xpm'     => 'image/x-xpixmap',
    'xwd'     => 'image/x-xwindowdump',
    'xyz'     => 'chemical/x-pdb',
    'zip'     => 'application/zip',
);
199
200
# getcharequiv(ENTITY, CONVERTSYMBOLS, AND_DECODE)
#
# Resolves a single HTML character reference to the character it names.
#
#   ENTITY          the reference with its leading '&' and trailing ';'
#                   already stripped, e.g. "#233", "#xE9" or "eacute".
#   CONVERTSYMBOLS  when true, names listed in %symnetosf (quot, amp,
#                   nbsp, ...) are resolved as well; otherwise only
#                   numeric references and names in %charnetosf are.
#   AND_DECODE      optional; when true the result is additionally passed
#                   through Encode::decode("utf8", ...).
#
# Returns the value produced by unicode::unicode2utf8 for the character
# code (raw UTF-8, going by this file's original comment), or the
# reference rebuilt as "&ENTITY;" when it is not recognised.
sub getcharequiv {
    my ($entity, $convertsymbols, $and_decode) = @_;

    # Numeric references 145-148 are reserved codes that code page 1252
    # uses for "smart" quotes; map them to the real quote characters.
    # Only these four codes are remapped.
    my %cp1252_quote = (
        145 => 0x2018,    # single left quote
        146 => 0x2019,    # single right quote
        147 => 0x201c,    # double left quote
        148 => 0x201d,    # double right quote
    );

    # A numeric reference: decimal (leading zeros dropped) or hex.
    my $code;
    if ($entity =~ m/^\#0*(\d+)$/) {
        $code = $1;
    }
    elsif ($entity =~ m/^\#x([0-9A-F]+)$/i) {
        $code = hex($1);
    }

    my $char_equiv;
    if (defined $code) {
        $code = $cp1252_quote{$code} if exists $cp1252_quote{$code};
        $char_equiv = &unicode::unicode2utf8([$code]);
    }
    # a named character entity
    elsif (defined $charnetosf{$entity}) {
        $char_equiv = &unicode::unicode2utf8([$charnetosf{$entity}]);
    }
    # a named symbol entity
    elsif ($convertsymbols && defined $symnetosf{$entity}) {
        $char_equiv = &unicode::unicode2utf8([$symnetosf{$entity}]);
    }

    # Unknown reference: hand it back exactly as it appeared in the text.
    return "&$entity;" unless defined $char_equiv;

    if ($and_decode) {
        # This file does not load Encode itself, so make sure it is
        # available before calling into it.
        require Encode;
        $char_equiv = Encode::decode("utf8", $char_equiv);
    }

    return $char_equiv;
}
249
# convertcharentities(TEXT)
#
# Rewrites TEXT in place, replacing every character reference of the form
# &name; &#NNN; or &#xHH; with whatever getcharequiv returns for it.
# getcharequiv is called with CONVERTSYMBOLS off, so symbol entities such
# as &amp; and &nbsp; come back unchanged.
sub convertcharentities {
    # args: the text that you want to convert

    # Only text that is shaped like a reference is matched.  A looser
    # pattern such as &([^;]+); lets a stray '&' (as in "R&D") capture
    # everything up to the next ';', which swallows a genuine entity
    # lying in between and leaves it unconverted.
    $_[0] =~ s/&(\#?[0-9A-Za-z]+);/getcharequiv($1, 0)/ge;
}
256
# convertallentities(TEXT)
#
# Rewrites TEXT in place, replacing every character reference of the form
# &name; &#NNN; or &#xHH; with whatever getcharequiv returns for it.
# getcharequiv is called with CONVERTSYMBOLS on, so symbol entities such
# as &amp; and &nbsp; are resolved as well as the letter entities.
sub convertallentities {
    # args: the text that you want to convert

    # Only text that is shaped like a reference is matched.  A looser
    # pattern such as &([^;]+); lets a stray '&' (as in "AT&T") capture
    # everything up to the next ';', which swallows a genuine entity
    # lying in between and leaves it unconverted.
    $_[0] =~ s/&(\#?[0-9A-Za-z]+);/getcharequiv($1, 1)/ge;
}
263
# html2txt(TEXT, STRIPTAGS)
#
# Flattens TEXT, in place, to plain text:
#   1. when STRIPTAGS is true, every <...> tag is deleted;
#   2. character entities are expanded by convertcharentities;
#   3. each character whose code is 32 or above is replaced by its
#      %sftotxt entry, or by a single space when the table has none.
# Characters below 32 (tab, newline, ...) are left untouched.
sub html2txt {
    # Remove the markup first, if the caller asked for that.
    $_[0] =~ s/<[^>]*>//g if $_[1];

    # Expand the character entities.
    convertcharentities($_[0]);

    # Map every character from code 32 upwards through the table in one
    # left-to-right pass; the inserted replacement text is not rescanned.
    # NOTE(review): the entity expansion above is described in this file
    # as producing raw UTF-8; if that means multi-byte sequences, each
    # byte is looked up on its own here -- confirm this is intended.
    $_[0] =~ s/([^\x00-\x1f])/my $plain = $sftotxt{ord($1)}; defined $plain ? $plain : " "/ge;
}
289
290
# guess_mime_type(FILENAME)
#
# Returns the MIME type registered in %mime_type for FILENAME's extension
# (the run of word characters after its last dot).  The name is lowercased
# before the lookup, so the match is case-insensitive.  Returns the string
# "unknown" when there is no extension or the extension is not in the table.
#
# For more complete tables look for mime.types (eg in /etc, or apache/conf
# directories), or have a look at
# <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
sub guess_mime_type {
    my ($filename) = @_;

    # The table is keyed by lowercase extensions, so fold the case of the
    # name before pulling the extension out of it.
    if (lc($filename) =~ /\.(\w+)$/) {
        my $known_type = $mime_type{$1};
        return $known_type if defined $known_type;
    }

    return "unknown";
}
307
308
3091;
Note: See TracBrowser for help on using the repository browser.