Changeset 73


Ignore:
Timestamp:
1998-12-11T20:59:16+13:00 (25 years ago)
Author:
rjmcnab
Message:

Added support for UTF-8.

Location:
trunk/gsdl/perllib
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/doc.pm

    r65 r73  
    33package doc;
    44
     5use unicode;
    56use util;
    67use html;
     
    463464# are for metadata which can have more than one value.
    464465
     466# set_metadata_element assumes the value is in (extended) ascii form.
     467# For text which has been already converted to the UTF-8 format use
     468# set_utf8_metadata_element.
    465469sub set_metadata_element {
    466470    my $self = shift (@_);
    467471    my ($section, $field, $value) = @_;
    468472
     473    $self->set_utf8_metadata_element ($section, $field,
     474                      &unicode::ascii2utf8($value));
     475}
     476
     477# set_utf8_metadata_element assumes the text has already been
     478# converted to the UTF-8 encoding.
     479sub set_utf8_metadata_element {
     480    my $self = shift (@_);
     481    my ($section, $field, $value) = @_;
     482
    469483    $self->delete_metadata ($section, $field);
    470     $self->add_metadata ($section, $field, $value);
     484    $self->add_utf8_metadata ($section, $field, $value);
    471485}
    472486
     
    491505}
    492506
     507# add_metadata assumes the text is in (extended) ascii form. For
     508# text which has been already converted to the UTF-8 format use
     509# add_utf8_metadata.
    493510sub add_metadata {
    494511    my $self = shift (@_);
    495512    my ($section, $field, $value) = @_;
    496513
    497     my $section_ptr = $self->_lookup_section($section);
    498     if (!defined $section_ptr) {
    499     print STDERR "doc::add_metadata couldn't find section " .
     514    $self->add_utf8_metadata ($section, $field,
     515                  &unicode::ascii2utf8($value));
     516}
     517
     518# add_utf8_metadata assumes the text has already been converted
     519# to the UTF-8 encoding.
     520sub add_utf8_metadata {
     521    my $self = shift (@_);
     522    my ($section, $field, $value) = @_;
     523
     524    my $section_ptr = $self->_lookup_section($section);
     525    if (!defined $section_ptr) {
     526    print STDERR "doc::add_utf8_metadata couldn't find section " .
    500527        "$section\n";
    501528    return;
     
    618645# methods for dealing with text
    619646
     647# add_text assumes the text is in (extended) ascii form. For
     648# text which has been already converted to the UTF-8 format
     649# use add_utf8_text.
    620650sub add_text {
    621651    my $self = shift (@_);
    622652    my ($section, $text) = @_;
    623653
    624     my $section_ptr = $self->_lookup_section($section);
    625     if (!defined $section_ptr) {
    626     print STDERR "doc::add_text couldn't find section " .
     654    # convert the text to UTF-8 encoded unicode characters
     655    # and add the text
     656    $self->add_utf8_text($section, &unicode::ascii2utf8($text));
     657}
     658
     659
     660# add_utf8_text assumes the text to be added has already
     661# been converted to the UTF-8 encoding. For ascii text use
     662# add_text
     663sub add_utf8_text {
     664    my $self = shift (@_);
     665    my ($section, $text) = @_;
     666
     667    my $section_ptr = $self->_lookup_section($section);
     668    if (!defined $section_ptr) {
     669    print STDERR "doc::add_utf8_text couldn't find section " .
    627670        "$section\n";
    628671    return;
     
    647690}
    648691
    649 # returns the length of the text for a section
     692# returns the (utf-8 encoded) length of the text for a section
    650693sub get_text_length {
    651694    my $self = shift (@_);
  • trunk/gsdl/perllib/gb.pm

    r71 r73  
    6262
    6363    while ($i < $len) {
    64     if (($c1 = ord(substr($intext, $i, 1))) > 127) {
    65         if ($i+1 < $len && ($c2 = ord(substr($intext, $i+1, 1))) > 127) {
    66         # found a GB character, put black square if cannot translate
    67 #       printf "in: %x\n", (($c1-128)*256+$c2-128);
    68         my $c = &transchar ($encodename, ($c1-128)*256+$c2-128);
    69         $c = 0x25a1 if $c == 0;
    70         push (@outtext, $c);
    71 #       printf "out: %x\n", $c;
    72         $i += 2;
    73 
     64    if (($c1 = ord(substr($intext, $i, 1))) >= 0xa0) {
     65        if ($i+1 < $len) {
     66        if (($c2 = ord(substr($intext, $i+1, 1))) >= 0xa0) {
     67            # found a GB character, put black square if cannot translate
     68            my $c = &transchar ($encodename, ($c1-128)*256+$c2-128);
     69            $c = 0x25a1 if $c == 0;
     70            push (@outtext, $c);
     71            $i += 2;
     72           
     73        } else {
     74            # error second character not >= 0xa0
     75            print STDERR "gb: ERROR second GB character not >= 0xa0\n";
     76            $i++;
     77           
     78        }
    7479        } else {
    7580        # error
    76         print STDERR "gb: ERROR in gb encoding\n";
     81        print STDERR "gb: ERROR missing second half of GB character\n";
    7782        $i++;
    7883        }
     
    8085    } else {
    8186        # normal ascii character
    82         push (@outtext, $c1);
     87        push (@outtext, $c1) if ($c1 < 0x80);
    8388        $i++;
    8489    }
  • trunk/gsdl/perllib/unicode.pm

    r71 r73  
    55
    66package unicode;
     7
     8
     9
     10# ascii2unicode takes a (extended) ascii string and
     11# returns a unicode array.
     12sub ascii2unicode {
     13    my ($in) = @_;
     14    my $out = [];
     15
     16    my $i = 0;
     17    my $len = length($in);
     18    while ($i < $len) {
     19    push (@$out, ord(substr ($in, $i, 1)));
     20    $i++;
     21    }
     22
     23    return $out;
     24}
     25
     26
     27# ascii2utf8 takes a (extended) ascii string and
     28# returns a UTF-8 encoded string. This is just
     29# a faster version of "&unicode2utf8(&ascii2unicode($str));"
     30sub ascii2utf8 {
     31    my ($in) = @_;
     32    my $out = "";
     33
     34    my ($c);
     35    my $i = 0;
     36    my $len = length($in);
     37    while ($i < $len) {
     38    $c = ord (substr ($in, $i, 1));
     39    if ($c < 0x80) {
     40        # ascii character
     41        $out .= chr ($c);
     42
     43    } else {
     44        # extended ascii character
     45        $out .= chr (0xc0 + (($c >> 6) & 0x1f));
     46        $out .= chr (0x80 + ($c & 0x3f));
     47    }
     48    $i++;
     49    }
     50
     51    return $out;
     52}
    753
    854
Note: See TracChangeset for help on using the changeset viewer.