Context Navigation

← Previous Change
Next Change →

ghtml.pm

Timestamp:

2008-08-28T13:44:31+12:00 (16 years ago)

Author:

ak19

Message:

1. Moved the mime_type hashmap out of the guess_mime_type subroutine, since the hashmap might otherwise be recreated every time that subroutine is called. 2. Converted the filename from which the MIME type is guessed to lowercase first, since the mime_type hashmap only contains lowercase file extensions.

File:

: 1 edited

gsdl/trunk/perllib/ghtml.pm (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

gsdl/trunk/perllib/ghtml.pm

-              r15894
+              r17058
+# Windows-1252 bytes in the range 0x80-0x9f that turn up as numeric entities
+# (non-standard Microsoft breakage), mapped to the Unicode code points they
+# are meant to denote. Declared outside getcharequiv so that the hash is
+# built once rather than on every call.
+my %cp1252_to_unicode = (
+    0x80 => 0x20ac, 0x82 => 0x201a, 0x83 => 0x0192, 0x84 => 0x201e,
+    0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02c6,
+    0x89 => 0x2030, 0x8a => 0x0160, 0x8b => 0x2039, 0x8c => 0x0152,
+    0x8e => 0x017d,
+    0x91 => 0x2018, # 145 = single left quote
+    0x92 => 0x2019, # 146 = single right quote
+    0x93 => 0x201c, # 147 = double left quote
+    0x94 => 0x201d, # 148 = double right quote
+    0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014, 0x98 => 0x02dc,
+    0x99 => 0x2122, 0x9a => 0x0161, 0x9b => 0x203a, 0x9c => 0x0153,
+    0x9e => 0x017e, 0x9f => 0x0178,
+);
+
+# returns the character as a raw utf-8 character (the result of passing its
+# code point to &unicode::unicode2utf8). It assumes that the & and ; have
+# been stripped off the string.
+#   $entity         - the entity without its & and ;, e.g. "#147", "#x93"
+#                     or a name that is a key of %charnetosf / %symnetosf
+#   $convertsymbols - true if named symbol entities (%symnetosf) should be
+#                     converted as well as named character entities
+# An entity that is not recognised is returned unchanged, with the & and ;
+# put back on.
+sub getcharequiv {
+    my ($entity, $convertsymbols) = @_;
+
+    # a numeric entity, either hexadecimal (#x93) or decimal (#147). Both
+    # patterns are anchored at the end so that something like "#12abc" is
+    # treated as unknown rather than as character 12.
+    my $code;
+    if ($entity =~ /^\#[xX]0*([0-9a-fA-F]{1,6})\z/) {
+        $code = hex($1);
+    } elsif ($entity =~ /^\#0*(\d+)\z/) {
+        $code = $1;
+    }
+    if (defined $code) {
+        # code page 1252 uses reserved bytes; map them to what was meant
+        $code = $cp1252_to_unicode{$code} if defined $cp1252_to_unicode{$code};
+        return &unicode::unicode2utf8([$code]);
+    }
+
+    # a named character entity
+    if (defined $charnetosf{$entity}) {
+        return &unicode::unicode2utf8([$charnetosf{$entity}]);
+    }
+
+    # a named symbol entity
+    if ($convertsymbols && defined $symnetosf{$entity}) {
+        return &unicode::unicode2utf8([$symnetosf{$entity}]);
+    }
+
+    return "&$entity;"; # unknown character
+}
+# convert character entities to their raw character equivalents, in place.
+# Named symbol entities are left as they are (getcharequiv is called with
+# symbol conversion off), as are entities getcharequiv does not recognise.
+sub convertcharentities {
+    # args: the text that you want to convert (modified through $_[0])
+
+    # An entity name may not contain '&', ';' or whitespace. Without that
+    # restriction a stray '&' swallows everything up to the next ';', real
+    # entities included (e.g. the &amp; in "fish & chips &amp; peas").
+    $_[0] =~ s/&([^&;\s]+);/&getcharequiv($1,0)/ge;
+}
+# convert any entities (named characters, named symbols and numeric
+# entities) to their raw character equivalents, in place. Entities that
+# getcharequiv does not recognise are left as they are.
+sub convertallentities {
+    # args: the text that you want to convert (modified through $_[0])
+
+    # An entity name may not contain '&', ';' or whitespace. Without that
+    # restriction a stray '&' swallows everything up to the next ';', real
+    # entities included (e.g. the &amp; in "fish & chips &amp; peas").
+    $_[0] =~ s/&([^&;\s]+);/&getcharequiv($1,1)/ge;
+}
+# Flattens html text to plain text, in place.
+#   $_[0] - the text to convert (overwritten with the result)
+#   $_[1] - true if sgml tags should be stripped out first
+sub html2txt {
+    # drop anything that looks like an sgml tag, when asked to
+    if ($_[1]) {
+        $_[0] =~ s/<[^>]*>//g;
+    }
+
+    # turn character entities into their raw character equivalents
+    &convertcharentities($_[0]);
+
+    # walk the text one character at a time, swapping each character for
+    # its plain text equivalent from %sftotxt
+    my $offset = 0;
+    while ($offset < length($_[0])) {
+        my $ordinal = ord(substr($_[0], $offset, 1));
+
+        # characters below #32 are left alone
+        if ($ordinal < 32) {
+            $offset++;
+            next;
+        }
+
+        # anything without an entry in %sftotxt becomes a single space
+        my $plain = defined $sftotxt{$ordinal} ? $sftotxt{$ordinal} : " ";
+        substr($_[0], $offset, 1, $plain);
+
+        # step over the replacement so that it is not converted again
+        $offset += length($plain);
+    }
+}
+# look for mime.types (eg in /etc, or apache/conf directories), or have a look
+# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
+sub guess_mime_type {
+    my ($filename) = @_;
+    my ($fileext) = $filename =~ /\.(\w+)$/;
+    return "unknown" unless defined $fileext;
+    my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",
+my %mime_type = ("ai"=>"application/postscript", "aif"=>"audio/x-aiff",
              "aifc"=>"audio/x-aiff", "aiff"=>"audio/x-aiff",
              "au"=>"audio/basic", "avi"=>"video/x-msvideo",
 …
              "xyz"=>"chemical/x-pdb", "zip"=>"application/zip");
+    return $mime_type{$fileext} if (defined $mime_type{$fileext});
+# Windows-1252 bytes in the range 0x80-0x9f that turn up as numeric entities
+# (non-standard Microsoft breakage), mapped to the Unicode code points they
+# are meant to denote. Declared outside getcharequiv so that the hash is
+# built once rather than on every call.
+my %cp1252_to_unicode = (
+    0x80 => 0x20ac, 0x82 => 0x201a, 0x83 => 0x0192, 0x84 => 0x201e,
+    0x85 => 0x2026, 0x86 => 0x2020, 0x87 => 0x2021, 0x88 => 0x02c6,
+    0x89 => 0x2030, 0x8a => 0x0160, 0x8b => 0x2039, 0x8c => 0x0152,
+    0x8e => 0x017d,
+    0x91 => 0x2018, # 145 = single left quote
+    0x92 => 0x2019, # 146 = single right quote
+    0x93 => 0x201c, # 147 = double left quote
+    0x94 => 0x201d, # 148 = double right quote
+    0x95 => 0x2022, 0x96 => 0x2013, 0x97 => 0x2014, 0x98 => 0x02dc,
+    0x99 => 0x2122, 0x9a => 0x0161, 0x9b => 0x203a, 0x9c => 0x0153,
+    0x9e => 0x017e, 0x9f => 0x0178,
+);
+
+# returns the character as a raw utf-8 character (the result of passing its
+# code point to &unicode::unicode2utf8). It assumes that the & and ; have
+# been stripped off the string.
+#   $entity         - the entity without its & and ;, e.g. "#147", "#x93"
+#                     or a name that is a key of %charnetosf / %symnetosf
+#   $convertsymbols - true if named symbol entities (%symnetosf) should be
+#                     converted as well as named character entities
+# An entity that is not recognised is returned unchanged, with the & and ;
+# put back on.
+sub getcharequiv {
+    my ($entity, $convertsymbols) = @_;
+
+    # a numeric entity, either hexadecimal (#x93) or decimal (#147). Both
+    # patterns are anchored at the end so that something like "#12abc" is
+    # treated as unknown rather than as character 12.
+    my $code;
+    if ($entity =~ /^\#[xX]0*([0-9a-fA-F]{1,6})\z/) {
+        $code = hex($1);
+    } elsif ($entity =~ /^\#0*(\d+)\z/) {
+        $code = $1;
+    }
+    if (defined $code) {
+        # code page 1252 uses reserved bytes; map them to what was meant
+        $code = $cp1252_to_unicode{$code} if defined $cp1252_to_unicode{$code};
+        return &unicode::unicode2utf8([$code]);
+    }
+
+    # a named character entity
+    if (defined $charnetosf{$entity}) {
+        return &unicode::unicode2utf8([$charnetosf{$entity}]);
+    }
+
+    # a named symbol entity
+    if ($convertsymbols && defined $symnetosf{$entity}) {
+        return &unicode::unicode2utf8([$symnetosf{$entity}]);
+    }
+
+    return "&$entity;"; # unknown character
+}
+# convert character entities to their raw character equivalents, in place.
+# Named symbol entities are left as they are (getcharequiv is called with
+# symbol conversion off), as are entities getcharequiv does not recognise.
+sub convertcharentities {
+    # args: the text that you want to convert (modified through $_[0])
+
+    # An entity name may not contain '&', ';' or whitespace. Without that
+    # restriction a stray '&' swallows everything up to the next ';', real
+    # entities included (e.g. the &amp; in "fish & chips &amp; peas").
+    $_[0] =~ s/&([^&;\s]+);/&getcharequiv($1,0)/ge;
+}
+# convert any entities (named characters, named symbols and numeric
+# entities) to their raw character equivalents, in place. Entities that
+# getcharequiv does not recognise are left as they are.
+sub convertallentities {
+    # args: the text that you want to convert (modified through $_[0])
+
+    # An entity name may not contain '&', ';' or whitespace. Without that
+    # restriction a stray '&' swallows everything up to the next ';', real
+    # entities included (e.g. the &amp; in "fish & chips &amp; peas").
+    $_[0] =~ s/&([^&;\s]+);/&getcharequiv($1,1)/ge;
+}
+# Flattens html text to plain text, in place.
+#   $_[0] - the text to convert (overwritten with the result)
+#   $_[1] - true if sgml tags should be stripped out first
+sub html2txt {
+    # drop anything that looks like an sgml tag, when asked to
+    if ($_[1]) {
+        $_[0] =~ s/<[^>]*>//g;
+    }
+
+    # turn character entities into their raw character equivalents
+    &convertcharentities($_[0]);
+
+    # walk the text one character at a time, swapping each character for
+    # its plain text equivalent from %sftotxt
+    my $offset = 0;
+    while ($offset < length($_[0])) {
+        my $ordinal = ord(substr($_[0], $offset, 1));
+
+        # characters below #32 are left alone
+        if ($ordinal < 32) {
+            $offset++;
+            next;
+        }
+
+        # anything without an entry in %sftotxt becomes a single space
+        my $plain = defined $sftotxt{$ordinal} ? $sftotxt{$ordinal} : " ";
+        substr($_[0], $offset, 1, $plain);
+
+        # step over the replacement so that it is not converted again
+        $offset += length($plain);
+    }
+}
+# look for mime.types (eg in /etc, or apache/conf directories), or have a look
+# at <ftp://ftp.iana.org/in-notes/iana/assignments/media-types/> for defaults.
+sub guess_mime_type {
+    my ($filename) = @_;
+    # make the filename lowercase, since the mimetypes hashmap looks for lowercase
+    $filename = lc($filename);
+    my ($fileext) = $filename =~ /\.(\w+)$/;
+    return "unknown" unless defined $fileext;
+    # else
+    my $mimetype =  $mime_type{$fileext};
+    return $mimetype if (defined $mimetype);
     return "unknown";

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 17058 for gsdl/trunk/perllib/ghtml.pm

Legend:

gsdl/trunk/perllib/ghtml.pm

Download in other formats: