Changeset 17058

Show
Ignore:
Timestamp:
28.08.2008 13:44:31 (11 years ago)
Author:
ak19
Message:

1. Moved the mime_type hashmap out of the guess_mime_type subroutine since this hashmap might get recreated everytime that subroutine is called. 2. Converted the filename from which the mimetype is guessed into lowercase first since the mime_type hashmap only contains lowercase file extension.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/ghtml.pm

    r15894 r17058  
    161161 
    162162 
    163 # returns the character as a raw utf-8 character. It assumes that the 
    164 # & and ; have been stripped off the string. 
    165 sub getcharequiv { 
    166     my ($entity, $convertsymbols) = @_; 
    167  
    168     # a numeric entity 
    169     if ($entity =~ /^\#0*(\d+)/) { 
    170     my $code=$1; 
    171     # non-standard Microsoft breakage, as usual 
    172     if ($code < 0x9f) { # code page 1252 uses reserved bytes 
    173         if ($code == 0x91) {$code=0x2018} # 145 = single left quote 
    174         elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote 
    175         elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote 
    176         elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote 
    177         # ... 
    178     } 
    179     return &unicode::unicode2utf8([$code]); 
    180     } 
    181      
    182     # a named character entity 
    183     if (defined $charnetosf{$entity}) { 
    184     return &unicode::unicode2utf8([$charnetosf{$entity}]); 
    185     } 
    186  
    187     # a named symbol entity 
    188     if ($convertsymbols && defined $symnetosf{$entity}) { 
    189     return &unicode::unicode2utf8([$symnetosf{$entity}]); 
    190     } 
    191  
    192     return "&$entity;"; # unknown character 
    193 } 
    194  
    195 # convert character entities from named equivalents to html font 
    196 sub convertcharentities { 
    197     # args: the text that you want to convert 
    198  
    199     $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse; 
    200 } 
    201  
    202 # convert any entities from named equivalents to html font 
    203 sub convertallentities { 
    204     # args: the text that you want to convert 
    205  
    206     $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse; 
    207 } 
    208  
    209 sub html2txt { 
    210     # args: the text that you want converted to ascii,  
    211     # and whether to strip out sgml tags 
    212  
    213     # strip out sgml tags if needed 
    214     $_[0] =~ s/<[^>]*>//g if $_[1]; 
    215  
    216     # convert the char entities to the standard html font 
    217     &convertcharentities($_[0]);  
    218      
    219     # convert the html character set to a plain ascii character set 
    220     my $pos = 0; 
    221     while ($pos < length($_[0])) { 
    222     my $charnum = ord(substr($_[0], $pos, 1)); 
    223     if ($charnum >= 32) { # only convert characters above #32 
    224         my $replacechars = " "; 
    225         $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum}; 
    226         substr($_[0], $pos, 1) = $replacechars; 
    227         $pos += length ($replacechars); 
    228  
    229     } else { 
    230         $pos ++; 
    231     } 
    232     } 
    233  
    234  
    235  
    236 # look for mime.types (eg in /etc, or apache/conf directories), or have a look 
    237 # at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults. 
    238 sub guess_mime_type { 
    239     my ($filename) = @_; 
    240  
    241     my ($fileext) = $filename =~ /\.(\w+)$/; 
    242     return "unknown" unless defined $fileext; 
    243  
    244     my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",  
     163my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",  
    245164             "aifc"=>"audio/x-aiff", "aiff"=>"audio/x-aiff",  
    246165             "au"=>"audio/basic", "avi"=>"video/x-msvideo", 
     
    298217             "xyz"=>"chemical/x-pdb", "zip"=>"application/zip"); 
    299218 
    300     return $mime_type{$fileext} if (defined $mime_type{$fileext}); 
     219 
     220# returns the character as a raw utf-8 character. It assumes that the 
     221# & and ; have been stripped off the string. 
     222sub getcharequiv { 
     223    my ($entity, $convertsymbols) = @_; 
     224 
     225    # a numeric entity 
     226    if ($entity =~ /^\#0*(\d+)/) { 
     227    my $code=$1; 
     228    # non-standard Microsoft breakage, as usual 
     229    if ($code < 0x9f) { # code page 1252 uses reserved bytes 
     230        if ($code == 0x91) {$code=0x2018} # 145 = single left quote 
     231        elsif ($code == 0x92) {$code=0x2019} # 146 = single right quote 
     232        elsif ($code == 0x93) {$code=0x201c} # 147 = double left quote 
     233        elsif ($code == 0x94) {$code=0x201d} # 148 = double right quote 
     234        # ... 
     235    } 
     236    return &unicode::unicode2utf8([$code]); 
     237    } 
     238     
     239    # a named character entity 
     240    if (defined $charnetosf{$entity}) { 
     241    return &unicode::unicode2utf8([$charnetosf{$entity}]); 
     242    } 
     243 
     244    # a named symbol entity 
     245    if ($convertsymbols && defined $symnetosf{$entity}) { 
     246    return &unicode::unicode2utf8([$symnetosf{$entity}]); 
     247    } 
     248 
     249    return "&$entity;"; # unknown character 
     250} 
     251 
     252# convert character entities from named equivalents to html font 
     253sub convertcharentities { 
     254    # args: the text that you want to convert 
     255 
     256    $_[0] =~ s/&([^;]+);/&getcharequiv($1,0)/gse; 
     257} 
     258 
     259# convert any entities from named equivalents to html font 
     260sub convertallentities { 
     261    # args: the text that you want to convert 
     262 
     263    $_[0] =~ s/&([^;]+);/&getcharequiv($1,1)/gse; 
     264} 
     265 
     266sub html2txt { 
     267    # args: the text that you want converted to ascii,  
     268    # and whether to strip out sgml tags 
     269 
     270    # strip out sgml tags if needed 
     271    $_[0] =~ s/<[^>]*>//g if $_[1]; 
     272 
     273    # convert the char entities to the standard html font 
     274    &convertcharentities($_[0]);  
     275     
     276    # convert the html character set to a plain ascii character set 
     277    my $pos = 0; 
     278    while ($pos < length($_[0])) { 
     279    my $charnum = ord(substr($_[0], $pos, 1)); 
     280    if ($charnum >= 32) { # only convert characters above #32 
     281        my $replacechars = " "; 
     282        $replacechars = $sftotxt{$charnum} if defined $sftotxt{$charnum}; 
     283        substr($_[0], $pos, 1) = $replacechars; 
     284        $pos += length ($replacechars); 
     285 
     286    } else { 
     287        $pos ++; 
     288    } 
     289    } 
     290 
     291 
     292 
     293# look for mime.types (eg in /etc, or apache/conf directories), or have a look 
     294# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults. 
     295sub guess_mime_type { 
     296    my ($filename) = @_; 
     297    # make the filename lowercase, since the mimetypes hashmap looks for lowercase 
     298    $filename = lc($filename); 
     299 
     300    my ($fileext) = $filename =~ /\.(\w+)$/; 
     301    return "unknown" unless defined $fileext; 
     302 
     303    # else 
     304    my $mimetype =  $mime_type{$fileext}; 
     305    return $mimetype if (defined $mimetype); 
    301306 
    302307    return "unknown";