Context Navigation

← Previous Changeset
Next Changeset →

Changeset 73

Timestamp:

1998-12-11T20:59:16+13:00 (25 years ago)

Author:

rjmcnab

Message:

Added support for UTF-8.

Location:

trunk/gsdl/perllib

Files:

: 3 edited

doc.pm (modified) (5 diffs)
gb.pm (modified) (2 diffs)
unicode.pm (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/perllib/doc.pm

-              r65
+              r73
 package doc;
+use unicode;
 use util;
 use html;
 …
 # are for metadata which can have more than one value.
+# set_metadata_element assumes the value is in (extended) ascii form.
+# For text which has already been converted to the UTF-8 format use
+# set_utf8_metadata_element.
 sub set_metadata_element {
     my $self = shift (@_);
     my ($section, $field, $value) = @_;
+    $self->set_utf8_metadata_element ($section, $field,
+                      &unicode::ascii2utf8($value));
+}
+# set_utf8_metadata_element assumes the text has already been
+# converted to the UTF-8 encoding.
+sub set_utf8_metadata_element {
+    my $self = shift (@_);
+    my ($section, $field, $value) = @_;
     $self->delete_metadata ($section, $field);
     $self->add_metadata ($section, $field, $value);
+    $self->add_utf8_metadata ($section, $field, $value);
+}
 …
+}
+# add_metadata assumes the text is in (extended) ascii form. For
+# text which has already been converted to the UTF-8 format use
+# add_utf8_metadata.
 sub add_metadata {
     my $self = shift (@_);
     my ($section, $field, $value) = @_;
+    my $section_ptr = $self->_lookup_section($section);
+    if (!defined $section_ptr) {
+    print STDERR "doc::add_metadata couldn't find section " .
+    $self->add_utf8_metadata ($section, $field,
+                  &unicode::ascii2utf8($value));
+}
+# add_utf8_metadata assumes the text has already been converted
+# to the UTF-8 encoding.
+sub add_utf8_metadata {
+    my $self = shift (@_);
+    my ($section, $field, $value) = @_;
+    my $section_ptr = $self->_lookup_section($section);
+    if (!defined $section_ptr) {
+    print STDERR "doc::add_utf8_metadata couldn't find section " .
         "$section\n";
     return;
 …
 # methods for dealing with text
+# add_text assumes the text is in (extended) ascii form. For
+# text which has been already converted to the UTF-8 format
+# use add_utf8_text.
 sub add_text {
     my $self = shift (@_);
     my ($section, $text) = @_;
+    my $section_ptr = $self->_lookup_section($section);
+    if (!defined $section_ptr) {
+    print STDERR "doc::add_text couldn't find section " .
+    # convert the text to UTF-8 encoded unicode characters
+    # and add the text
+    $self->add_utf8_text($section, &unicode::ascii2utf8($text));
+}
+# add_utf8_text assumes the text to be added has already
+# been converted to the UTF-8 encoding. For ascii text use
+# add_text
+sub add_utf8_text {
+    my $self = shift (@_);
+    my ($section, $text) = @_;
+    my $section_ptr = $self->_lookup_section($section);
+    if (!defined $section_ptr) {
+    print STDERR "doc::add_utf8_text couldn't find section " .
         "$section\n";
     return;
 …
+}
 # returns the length of the text for a section
+# returns the (utf-8 encoded) length of the text for a section
 sub get_text_length {
     my $self = shift (@_);

trunk/gsdl/perllib/gb.pm

-              r71
+              r73
     while ($i < $len) {
+    if (($c1 = ord(substr($intext, $i, 1))) > 127) {
+        if ($i+1 < $len && ($c2 = ord(substr($intext, $i+1, 1))) > 127) {
+        # found a GB character, put black square if cannot translate
+#       printf "in: %x\n", (($c1-128)*256+$c2-128);
+        my $c = &transchar ($encodename, ($c1-128)*256+$c2-128);
+        $c = 0x25a1 if $c == 0;
+        push (@outtext, $c);
+#       printf "out: %x\n", $c;
+        $i += 2;
+    if (($c1 = ord(substr($intext, $i, 1))) >= 0xa0) {
+        if ($i+1 < $len) {
+        if (($c2 = ord(substr($intext, $i+1, 1))) >= 0xa0) {
+            # found a GB character, put white square (U+25A1) if cannot translate
+            my $c = &transchar ($encodename, ($c1-128)*256+$c2-128);
+            $c = 0x25a1 if $c == 0;
+            push (@outtext, $c);
+            $i += 2;
+        } else {
+            # error second character not >= 0xa0
+            print STDERR "gb: ERROR second GB character not >= 0xa0\n";
+            $i++;
+        }
         } else {
         # error
         print STDERR "gb: ERROR in gb encoding\n";
+        print STDERR "gb: ERROR missing second half of GB character\n";
         $i++;
+        }
 …
     } else {
         # normal ascii character
         push (@outtext, $c1);
+        push (@outtext, $c1) if ($c1 < 0x80);
         $i++;
+    }

trunk/gsdl/perllib/unicode.pm

-              r71
+              r73
 package unicode;
+# ascii2unicode converts an (extended) ascii string into a
+# unicode array: a reference to a list holding the ordinal
+# value of each input character, in order. An empty string
+# gives a reference to an empty list.
+sub ascii2unicode {
+    my ($in) = @_;
+    # split on the empty pattern yields one element per character
+    my @codes = map { ord ($_) } split (//, $in);
+    return \@codes;
+}
+# ascii2utf8 converts an (extended) ascii string into a UTF-8
+# encoded string. It gives the same result as
+# "&unicode2utf8(&ascii2unicode($str));" but does the work in
+# a single pass without building the intermediate array.
+sub ascii2utf8 {
+    my ($in) = @_;
+    my @pieces = ();
+    foreach my $char (split (//, $in)) {
+        my $code = ord ($char);
+        if ($code >= 0x80) {
+            # extended ascii character: emit a two byte sequence,
+            # a lead byte (0xc0 plus the bits above the low six)
+            # followed by a continuation byte (0x80 plus the low
+            # six bits)
+            push (@pieces, chr (0xc0 + (($code >> 6) & 0x1f)));
+            push (@pieces, chr (0x80 + ($code & 0x3f)));
+        } else {
+            # plain ascii character, passed through unchanged
+            push (@pieces, chr ($code));
+        }
+    }
+    return join ("", @pieces);
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 73

Legend:

trunk/gsdl/perllib/doc.pm

trunk/gsdl/perllib/gb.pm

trunk/gsdl/perllib/unicode.pm

Download in other formats: