Ignore:
Timestamp:
2008-08-28T13:44:31+12:00 (16 years ago)
Author:
ak19
Message:
  1. Moved the mime_type hashmap out of the guess_mime_type subroutine since this hashmap might get recreated everytime that subroutine is called. 2. Converted the filename from which the mimetype is guessed into lowercase first since the mime_type hashmap only contains lowercase file extension.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/ghtml.pm

    r15894 r17058  
    161161
    162162
    163 # returns the character as a raw utf-8 character. It assumes that the
    164 # & and ; have been stripped off the string.
    165 sub getcharequiv {
    166     my ($entity, $convertsymbols) = @_;
    167 
    168     # a numeric entity
    169     if ($entity =~ /^\#0*(\d+)/) {
    170     my $code=$1;
    171     # non-standard Microsoft breakage, as usual
    172     if ($code < 0x9f) { # code page 1252 uses reserved bytes
    173         if ($code == 0x91) {$code=0x2018} # 145 = single left quote
    174         elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote
    175         elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote
    176         elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote
    177         # ...
    178     }
    179     return &unicode::unicode2utf8([$code]);
    180     }
    181    
    182     # a named character entity
    183     if (defined $charnetosf{$entity}) {
    184     return &unicode::unicode2utf8([$charnetosf{$entity}]);
    185     }
    186 
    187     # a named symbol entity
    188     if ($convertsymbols && defined $symnetosf{$entity}) {
    189     return &unicode::unicode2utf8([$symnetosf{$entity}]);
    190     }
    191 
    192     return "&$entity;"; # unknown character
    193 }
    194 
    195 # convert character entities from named equivalents to html font
    196 sub convertcharentities {
    197     # args: the text that you want to convert
    198 
    199     $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse;
    200 }
    201 
    202 # convert any entities from named equivalents to html font
    203 sub convertallentities {
    204     # args: the text that you want to convert
    205 
    206     $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse;
    207 }
    208 
    209 sub html2txt {
    210     # args: the text that you want converted to ascii,
    211     # and whether to strip out sgml tags
    212 
    213     # strip out sgml tags if needed
    214     $_[0] =~ s/<[^>]*>//g if $_[1];
    215 
    216     # convert the char entities to the standard html font
    217     &convertcharentities($_[0]);
    218    
    219     # convert the html character set to a plain ascii character set
    220     my $pos = 0;
    221     while ($pos < length($_[0])) {
    222     my $charnum = ord(substr($_[0], $pos, 1));
    223     if ($charnum >= 32) { # only convert characters above #32
    224         my $replacechars = " ";
    225         $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum};
    226         substr($_[0], $pos, 1) = $replacechars;
    227         $pos += length ($replacechars);
    228 
    229     } else {
    230         $pos ++;
    231     }
    232     }
    233 
    234 
    235 
    236 # look for mime.types (eg in /etc, or apache/conf directories), or have a look
    237 # at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
    238 sub guess_mime_type {
    239     my ($filename) = @_;
    240 
    241     my ($fileext) = $filename =~ /\.(\w+)$/;
    242     return "unknown" unless defined $fileext;
    243 
    244     my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",
     163my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",
    245164             "aifc"=>"audio/x-aiff", "aiff"=>"audio/x-aiff",
    246165             "au"=>"audio/basic", "avi"=>"video/x-msvideo",
     
    298217             "xyz"=>"chemical/x-pdb", "zip"=>"application/zip");
    299218
    300     return $mime_type{$fileext} if (defined $mime_type{$fileext});
     219
     220# returns the character as a raw utf-8 character. It assumes that the
     221# & and ; have been stripped off the string.
     222sub getcharequiv {
     223    my ($entity, $convertsymbols) = @_;
     224
     225    # a numeric entity
     226    if ($entity =~ /^\#0*(\d+)/) {
     227    my $code=$1;
     228    # non-standard Microsoft breakage, as usual
     229    if ($code < 0x9f) { # code page 1252 uses reserved bytes
     230        if ($code == 0x91) {$code=0x2018} # 145 = single left quote
     231        elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote
     232        elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote
     233        elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote
     234        # ...
     235    }
     236    return &unicode::unicode2utf8([$code]);
     237    }
     238   
     239    # a named character entity
     240    if (defined $charnetosf{$entity}) {
     241    return &unicode::unicode2utf8([$charnetosf{$entity}]);
     242    }
     243
     244    # a named symbol entity
     245    if ($convertsymbols && defined $symnetosf{$entity}) {
     246    return &unicode::unicode2utf8([$symnetosf{$entity}]);
     247    }
     248
     249    return "&$entity;"; # unknown character
     250}
     251
     252# convert character entities from named equivalents to html font
     253sub convertcharentities {
     254    # args: the text that you want to convert
     255
     256    $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse;
     257}
     258
     259# convert any entities from named equivalents to html font
     260sub convertallentities {
     261    # args: the text that you want to convert
     262
     263    $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse;
     264}
     265
     266sub html2txt {
     267    # args: the text that you want converted to ascii,
     268    # and whether to strip out sgml tags
     269
     270    # strip out sgml tags if needed
     271    $_[0] =~ s/<[^>]*>//g if $_[1];
     272
     273    # convert the char entities to the standard html font
     274    &convertcharentities($_[0]);
     275   
     276    # convert the html character set to a plain ascii character set
     277    my $pos = 0;
     278    while ($pos < length($_[0])) {
     279    my $charnum = ord(substr($_[0], $pos, 1));
     280    if ($charnum >= 32) { # only convert characters above #32
     281        my $replacechars = " ";
     282        $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum};
     283        substr($_[0], $pos, 1) = $replacechars;
     284        $pos += length ($replacechars);
     285
     286    } else {
     287        $pos ++;
     288    }
     289    }
     290
     291
     292
     293# look for mime.types (eg in /etc, or apache/conf directories), or have a look
     294# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
     295sub guess_mime_type {
     296    my ($filename) = @_;
     297    # make the filename lowercase, since the mimetypes hashmap looks for lowercase
     298    $filename = lc($filename);
     299
     300    my ($fileext) = $filename =~ /\.(\w+)$/;
     301    return "unknown" unless defined $fileext;
     302
     303    # else
     304    my $mimetype =  $mime_type{$fileext};
     305    return $mimetype if (defined $mimetype);
    301306
    302307    return "unknown";
Note: See TracChangeset for help on using the changeset viewer.