Ignore:
Timestamp:
2001-01-26T17:25:49+13:00 (23 years ago)
Author:
sjboddie
Message:

Made a bunch of changes to the building code to support lots of new
languages and encodings. It's still kind of a mess but should be fixed
up over the weekend.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/unicode.pm

    r1844 r1868  
    133133}
    134134
     135# iscii2unicode is basically identical to iso2unicode, the only
     136# difference being that the map files live in unicode/MAPPINGS/ISCII
     137#
     138# values for $encoding may be 'Devanagari' only at present
     139sub iscii2unicode {
     140    my ($encoding, $in) = @_;
     141    my $out = [];
     142
     143    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
     144                      "ISCII", "$encoding.txt");
     145    return $out unless &loadmapping ($encoding, $mapfile);
     146
     147    my $i = 0;
     148    my $len = length($in);
     149    while ($i < $len) {
     150    my $c = ord(substr ($in, $i, 1));
     151    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
     152    push (@$out, $c);
     153    $i++;
     154    }
     155
     156    return $out;
     157}
    135158
    136159# ascii2utf8 takes a (extended) ascii string and
     
    169192   
    170193    foreach $num (@$in) {
     194    next unless defined $num;
    171195    if ($num < 0x80) {
    172196        $out .= chr ($num);
     
    326350
    327351
     352
     353
     354
     355
     356
     357
     358
     359####################################################################################################
     360
     361
     362# %translations is of the form:
     363#
     364# encodings{encodingname-encodingname}->blocktranslation
     365# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
     366#
     367# Any of the top translation blocks can point to an undefined
     368# value. This data structure aims to allow fast translation and
     369# efficient storage.
     370%translations = ();
     371
     372# @array256 is used for initialisation, there must be
     373# a better way...
     374@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     375         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     376         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     377         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     378         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     379         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     380         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     381         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     382         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     383         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     384         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     385         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     386         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     387         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     388         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     389         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
     390
     391$encodings = {
     392    'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
     393             'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},
     394
     395    'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
     396             'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},
     397
     398    'iso_8859_3' => {'fullname' => 'Latin3',
     399             'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},
     400
     401    'iso_8859_4' => {'fullname' => 'Latin4',
     402             'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},
     403
     404    'iso_8859_5' => {'fullname' => 'Cyrillic',
     405             'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},
     406
     407    'iso_8859_6' => {'fullname' => 'Arabic',
     408             'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},
     409
     410    'iso_8859_7' => {'fullname' => 'Greek',
     411             'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},
     412
     413    'iso_8859_8' => {'fullname' => 'Hebrew',
     414             'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},
     415
     416    'iso_8859_9' => {'fullname' => 'Latin5',
     417             'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},
     418
     419    'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
     420               'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},
     421
     422    'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
     423               'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},
     424
     425    'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
     426               'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},
     427
     428    'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
     429               'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},
     430
     431    'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
     432               'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},
     433
     434    'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
     435               'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},
     436
     437    'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
     438               'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},
     439
     440    'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
     441               'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},
     442
     443    'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
     444               'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},
     445
     446    'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
     447              'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},
     448
     449    'koi8_r' => {'fullname' => 'Cyrillic',
     450         'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},
     451
     452    'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
     453         'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},
     454
     455    'iscii_de' => {'fullname' => 'ISCII Devanagari',
     456           'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
     457};
     458
     459# returns a pointer to unicode array
     460sub simple2unicode {
     461    my ($encoding, $intext) = @_;
     462
     463    if (!defined ($encodings->{$encoding})) {
     464    print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
     465    return [];
     466    }
     467
     468    my $info = $encodings->{$encoding};
     469    my $encodename = "$encoding-unicode";
     470    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
     471                      $info->{'mapfile'});
     472
     473    if (!&loadmapencoding ($encodename, $mapfile)) {
     474    print STDERR "unicode: ERROR - could not load encoding $encodename\n";
     475    return [];
     476    }
     477   
     478    my @outtext = ();
     479    my $len = length($intext);
     480    my ($c);
     481    my $i = 0;
     482
     483    while ($i < $len) {
     484    if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {
     485        # normal ascii character
     486        push (@outtext, $c);
     487    } else {
     488        push (@outtext, &transchar ($encodename, $c));
     489    }
     490    $i ++;
     491    }
     492    return \@outtext;
     493}
     494
     495# returns 1 if successful, 0 if unsuccessful
     496sub loadmapencoding {
     497    my ($encoding, $mapfile) = @_;
     498   
     499    # check to see if the encoding has already been loaded
     500    return 1 if (defined $translations{$encoding});
     501
     502    return 0 unless open (MAPFILE, $mapfile);
     503    binmode (MAPFILE);
     504
     505    $translations{$encoding} = [@array256];
     506    my $block = $translations{$encoding};
     507
     508    my ($in,$i,$j);
     509    while (read(MAPFILE, $in, 1) == 1) {
     510    $i = unpack ("C", $in);
     511    $block->[$i] = [@array256];
     512    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
     513        my ($n1, $n2) = unpack ("CC", $in);
     514        $block->[$i]->[$j] = ($n1*256) + $n2;
     515    }
     516    }
     517
     518    close (MAPFILE);
     519}
     520
     521sub transchar {
     522    my ($encoding, $from) = @_;
     523    my $high = ($from / 256) % 256;
     524    my $low = $from % 256;
     525
     526    return 0 unless defined $translations{$encoding};
     527
     528    my $block = $translations{$encoding};
     529
     530    if (ref ($block->[$high]) ne "ARRAY") {
     531    return 0;
     532    }
     533    return $block->[$high]->[$low];
     534}
     535
     536
     537
     538
    3285391;
     540
Note: See TracChangeset for help on using the changeset viewer.