Ignore:
Timestamp:
2000-07-13T10:21:53+12:00 (24 years ago)
Author:
sjboddie
Message:

merged changes to trunk into New_Config_Format branch

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/New_Config_Format-branch/gsdl/perllib/unicode.pm

    r537 r1279  
    3131package unicode;
    3232
    33 
    34 
    35 # ascii2unicode takes a (extended) ascii string and
    36 # returns a unicode array.
     33%translations = ();
     34
     35# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
     36# and returns a unicode array.
    3737sub ascii2unicode {
    3838    my ($in) = @_;
     
    4343    while ($i < $len) {
    4444    push (@$out, ord(substr ($in, $i, 1)));
     45    $i++;
     46    }
     47
     48    return $out;
     49}
     50
     51# arabic2unicode takes an 8 bit Arabic string (ISO-8859-6)
     52# and returns a unicode array
     53sub arabic2unicode {
     54    my ($in) = @_;
     55    my $out = [];
     56
     57    my $i = 0;
     58    my $len = length($in);
     59    while ($i < $len) {
     60    my $c = ord(substr ($in, $i, 1));
     61    $c += (1567-191) if ($c >= 0x80);
     62    push (@$out, $c);
     63    $i++;
     64    }
     65
     66    return $out;
     67}
     68
     69# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
     70# and returns a unicode array. These encodings are similar to but not
     71# identical to the corresponding ISO-8859 encodings.
     72#
     73# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
     74sub windows2unicode {
     75    my ($encoding, $in) = @_;
     76    my $out = [];
     77
     78    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
     79                      "WINDOWS", "$encoding.TXT");
     80    return $out unless &loadmapping ($encoding, $mapfile);
     81
     82    my $i = 0;
     83    my $len = length($in);
     84    while ($i < $len) {
     85    my $c = ord(substr ($in, $i, 1));
     86    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
     87    push (@$out, $c);
    4588    $i++;
    4689    }
     
    193236}
    194237
     238# loadmapping expects the mapfile to contain (at least) two
     239# tab-separated fields. The first field is the mapped value
     240# and the second field is the unicode value.
     241#
     242# It returns 1 if successful, 0 if unsuccessful
     243sub loadmapping {
     244    my ($encoding, $mapfile) = @_;
     245   
     246    my $to = "$encoding-unicode";
     247    my $from = "unicode-$encoding";
     248
     249    # check to see if the encoding has already been loaded
     250    if (defined $translations{$to} && defined $translations{$from}) {
     251    return 1;
     252    }
     253
     254    if (!open (MAPFILE, $mapfile)) {
     255    print STDERR "ERROR: unable to load mapfile $mapfile\n";
     256    return 0;
     257    }
     258
     259    my ($line, @line);
     260    $translations{$to} = {};
     261    $translations{$from} = {};
     262    while (defined ($line = <MAPFILE>)) {
     263    # remove comments
     264    $line =~ s/\#.*$//;
     265    next unless $line =~ /\S/;
     266
     267    # split the line into fields and do a few
     268    # simple sanity checks
     269    @line = split (/\t/, $line);
     270    next unless (scalar(@line) >= 2 &&
     271             $line[0] =~ /^0x/ &&
     272             $line[1] =~ /^0x/);
     273
     274    my $a = hex($line[0]);
     275    my $b = hex($line[1]);
     276
     277    $translations{$to}->{$a} = $b;
     278    $translations{$from}->{$b} = $a;
     279    }
     280
     281    close (MAPFILE);
     282
     283    return 1;
     284}
     285
    195286
    1962871;
    197 
Note: See TracChangeset for help on using the changeset viewer.