Changeset 17058 for gsdl/trunk/perllib/ghtml.pm
- Timestamp:
- 2008-08-28T13:44:31+12:00 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/ghtml.pm
r15894 r17058 161 161 162 162 163 # returns the character as a raw utf-8 character. It assumes that the 164 # & and ; have been stripped off the string. 165 sub getcharequiv { 166 my ($entity, $convertsymbols) = @_; 167 168 # a numeric entity 169 if ($entity =~ /^\#0*(\d+)/) { 170 my $code=$1; 171 # non-standard Microsoft breakage, as usual 172 if ($code < 0x9f) { # code page 1252 uses reserved bytes 173 if ($code == 0x91) {$code=0x2018} # 145 = single left quote 174 elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote 175 elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote 176 elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote 177 # ... 178 } 179 return &unicode::unicode2utf8([$code]); 180 } 181 182 # a named character entity 183 if (defined $charnetosf{$entity}) { 184 return &unicode::unicode2utf8([$charnetosf{$entity}]); 185 } 186 187 # a named symbol entity 188 if ($convertsymbols && defined $symnetosf{$entity}) { 189 return &unicode::unicode2utf8([$symnetosf{$entity}]); 190 } 191 192 return "&$entity;"; # unknown character 193 } 194 195 # convert character entities from named equivalents to html font 196 sub convertcharentities { 197 # args: the text that you want to convert 198 199 $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse; 200 } 201 202 # convert any entities from named equivalents to html font 203 sub convertallentities { 204 # args: the text that you want to convert 205 206 $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse; 207 } 208 209 sub html2txt { 210 # args: the text that you want converted to ascii, 211 # and whether to strip out sgml tags 212 213 # strip out sgml tags if needed 214 $_[0] =~ s/<[^>]*>//g if $_[1]; 215 216 # convert the char entities to the standard html font 217 &convertcharentities($_[0]); 218 219 # convert the html character set to a plain ascii character set 220 my $pos = 0; 221 while ($pos < length($_[0])) { 222 my $charnum = ord(substr($_[0], $pos, 1)); 223 if ($charnum >= 32) { # only convert characters above #32 224 my $replacechars = " "; 225 $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum}; 226 substr($_[0], $pos, 1) = $replacechars; 227 $pos += length ($replacechars); 228 229 } else { 230 $pos ++; 231 } 232 } 233 } 234 235 236 # look for mime.types (eg in /etc, or apache/conf directories), or have a look 237 # at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults. 238 sub guess_mime_type { 239 my ($filename) = @_; 240 241 my ($fileext) = $filename =~ /\.(\w+)$/; 242 return "unknown" unless defined $fileext; 243 244 my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff", 163 my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff", 245 164 "aifc"=>"audio/x-aiff", "aiff"=>"audio/x-aiff", 246 165 "au"=>"audio/basic", "avi"=>"video/x-msvideo", … … 298 217 "xyz"=>"chemical/x-pdb", "zip"=>"application/zip"); 299 218 300 return $mime_type{$fileext} if (defined $mime_type{$fileext}); 219 220 # returns the character as a raw utf-8 character. It assumes that the 221 # & and ; have been stripped off the string. 222 sub getcharequiv { 223 my ($entity, $convertsymbols) = @_; 224 225 # a numeric entity 226 if ($entity =~ /^\#0*(\d+)/) { 227 my $code=$1; 228 # non-standard Microsoft breakage, as usual 229 if ($code < 0x9f) { # code page 1252 uses reserved bytes 230 if ($code == 0x91) {$code=0x2018} # 145 = single left quote 231 elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote 232 elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote 233 elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote 234 # ... 235 } 236 return &unicode::unicode2utf8([$code]); 237 } 238 239 # a named character entity 240 if (defined $charnetosf{$entity}) { 241 return &unicode::unicode2utf8([$charnetosf{$entity}]); 242 } 243 244 # a named symbol entity 245 if ($convertsymbols && defined $symnetosf{$entity}) { 246 return &unicode::unicode2utf8([$symnetosf{$entity}]); 247 } 248 249 return "&$entity;"; # unknown character 250 } 251 252 # convert character entities from named equivalents to html font 253 sub convertcharentities { 254 # args: the text that you want to convert 255 256 $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse; 257 } 258 259 # convert any entities from named equivalents to html font 260 sub convertallentities { 261 # args: the text that you want to convert 262 263 $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse; 264 } 265 266 sub html2txt { 267 # args: the text that you want converted to ascii, 268 # and whether to strip out sgml tags 269 270 # strip out sgml tags if needed 271 $_[0] =~ s/<[^>]*>//g if $_[1]; 272 273 # convert the char entities to the standard html font 274 &convertcharentities($_[0]); 275 276 # convert the html character set to a plain ascii character set 277 my $pos = 0; 278 while ($pos < length($_[0])) { 279 my $charnum = ord(substr($_[0], $pos, 1)); 280 if ($charnum >= 32) { # only convert characters above #32 281 my $replacechars = " "; 282 $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum}; 283 substr($_[0], $pos, 1) = $replacechars; 284 $pos += length ($replacechars); 285 286 } else { 287 $pos ++; 288 } 289 } 290 } 291 292 293 # look for mime.types (eg in /etc, or apache/conf directories), or have a look 294 # at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults. 295 sub guess_mime_type { 296 my ($filename) = @_; 297 # make the filename lowercase, since the mimetypes hashmap looks for lowercase 298 $filename = lc($filename); 299 300 my ($fileext) = $filename =~ /\.(\w+)$/; 301 return "unknown" unless defined $fileext; 302 303 # else 304 my $mimetype = $mime_type{$fileext}; 305 return $mimetype if (defined $mimetype); 301 306 302 307 return "unknown";
Note:
See TracChangeset
for help on using the changeset viewer.