Changeset 1227


Ignore:
Timestamp:
2000-06-21T15:58:49+12:00 (24 years ago)
Author:
sjboddie
Message:

Modified the Perl code for importing Arabic encoded documents. Plugins
now support a windows_1256 and an iso_8859_6 encoding. I was briefly under
the impression that these two encodings were similar enough to be treated
the same. It turns out they're not. It appears that the Windows codepage
1256 is the most commonly used Arabic encoding so "arabic" is a synonym
for windows_1256.

Location:
trunk/gsdl
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/togb.pl

    r1226 r1227  
    3636if (!parsargv::parse(\@ARGV,
    3737             'unicode', \$unicode,
    38              'latin1', \$latin1,
     38             'iso_8859_1', \$iso_8859_1,
    3939             'gb', \$gb)) {
    4040    print STDERR "\n  usage: $0 [options]\n\n";
    4141    print STDERR "  options:\n";
    4242    print STDERR "   -unicode    input is in utf-8 or unicode (default)\n";
    43     print STDERR "   -latin1     input is in extended ascii (ISO-8859-1)\n";
     43    print STDERR "   -iso_8859_1 input is in extended ascii (ISO-8859-1 Latin 1)\n";
    4444    print STDERR "   -gb         input is in GB or GBK (simplified Chinese)\n\n";
    4545    die "\n";
     
    4747
    4848$encoding = "utf8" if $unicode;
    49 $encoding = "latin1" if $latin1;
     49$encoding = "iso_8859_1" if $iso_8859_1;
    5050$encoding = "gb" if $gb;
    5151
  • trunk/gsdl/bin/script/touc.pl

    r1226 r1227  
    3636if (!parsargv::parse(\@ARGV,
    3737             'unicode', \$unicode,
    38              'latin1', \$latin1,
    39              'arabic', \$arabic,
     38             'iso_8859_1', \$iso_8859_1,
     39             'iso_8859_6', \$iso_8859_6,
     40             'windows_1256', \$windows_1256,
    4041             'gb', \$gb)) {
    4142    print STDERR "\n  usage: $0 [options]\n\n";
    4243    print STDERR "  options:\n";
    43     print STDERR "   -unicode    input is in utf-8 or unicode (default)\n";
    44     print STDERR "   -latin1     input is in extended ascii (ISO-8859-1)\n";
    45     print STDERR "   -arabic     input is in 8 bit Arabic (ISO-8859-6)\n";
    46     print STDERR "   -gb         input is in GB or GBK (simplified Chinese)\n\n";
     44    print STDERR "   -unicode      input is in utf-8 or unicode (default)\n";
     45    print STDERR "   -iso_8859_1   input is in extended ascii (ISO-8859-1 Latin 1)\n";
     46    print STDERR "   -iso_8859_6   input is in 8 bit Arabic (ISO-8859-6)\n";
     47    print STDERR "   -windows_1256 input is in Windows 1256 (Arabic)\n";
     48    print STDERR "   -gb           input is in GB or GBK (simplified Chinese)\n\n";
    4749    die "\n";
    4850}
    4951
    5052$encoding = "utf8" if $unicode;
    51 $encoding = "latin1" if $latin1;
    52 $encoding = "arabic" if $arabic;
     53$encoding = "iso_8859_1" if $iso_8859_1;
     54$encoding = "iso_8859_6" if $iso_8859_6;
     55$encoding = "windows_1256" if $windows_1256;
    5356$encoding = "gb" if $gb;
    5457
  • trunk/gsdl/bin/script/toutf8.pl

    r1226 r1227  
    3636if (!parsargv::parse(\@ARGV,
    3737             'unicode', \$unicode,
    38              'latin1', \$latin1,
    39              'arabic', \$arabic,
     38             'iso_8859_1', \$iso_8859_1,
     39             'iso_8859_6', \$iso_8859_6,
     40             'windows_1256', \$windows_1256,
    4041             'gb', \$gb)) {
    4142    print STDERR "\n  usage: $0 [options]\n\n";
    4243    print STDERR "  options:\n";
    43     print STDERR "   -unicode    input is in utf-8 or unicode (default)\n";
    44     print STDERR "   -latin1     input is in extended ascii (ISO-8859-1)\n";
    45     print STDERR "   -arabic     input is in 8 bit Arabic (ISO-8859-6)\n";
    46     print STDERR "   -gb         input is in GB or GBK (simplified Chinese)\n\n";
     44    print STDERR "   -unicode      input is in utf-8 or unicode (default)\n";
     45    print STDERR "   -iso_8859_1   input is in extended ascii (ISO-8859-1 Latin 1)\n";
     46    print STDERR "   -iso_8859_6   input is in 8 bit Arabic (ISO-8859-6)\n";
     47    print STDERR "   -windows_1256 input is in Windows 1256 (Arabic)\n";
     48    print STDERR "   -gb           input is in GB or GBK (simplified Chinese)\n\n";
    4749    die "\n";
    4850}
    4951
    5052$encoding = "utf8" if $unicode;
    51 $encoding = "latin1" if $latin1;
    52 $encoding = "arabic" if $arabic;
     53$encoding = "iso_8859_1" if $iso_8859_1;
     54$encoding = "iso_8859_6" if $iso_8859_6;
     55$encoding = "windows_1256" if $windows_1256;
    5356$encoding = "gb" if $gb;
    5457
  • trunk/gsdl/perllib/multiread.pm

    r1224 r1227  
    2626# encodings currently supported are
    2727#
    28 # utf8     - either utf8 or unicode (automatically detected)
    29 # unicode  - just unicode (doesn't currently do endian detection)
    30 # gb       - GB
    31 # latin1   - extended ascii (iso-8859-1)
    32 # arabic   - 8 bit arabic (iso-8859-6)
     28# utf8         - either utf8 or unicode (automatically detected)
     29# unicode      - just unicode (doesn't currently do endian detection)
     30# gb           - GB
     31# iso_8859_1   - extended ascii (iso-8859-1)
     32# iso_8859_6   - 8 bit arabic (iso-8859-6)
     33# windows_1256 - Windows codepage 1256 (Arabic)
    3334
    3435package multiread;
     
    169170    }
    170171
    171     if ($self->{'encoding'} eq "latin1") {
     172    if ($self->{'encoding'} eq "iso_8859_1") {
    172173    # Latin 1 extended ascii (ISO-8859-1)
    173174    return undef if (eof ($handle));
     
    175176    }
    176177
    177     if ($self->{'encoding'} eq "arabic") {
     178    if ($self->{'encoding'} eq "iso_8859_6") {
    178179    # 8 bit Arabic (ISO-8859-6)
    179180    return undef if (eof ($handle));
    180181    return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
     182    }
     183
     184    if ($self->{'encoding'} eq "windows_1256") {
     185    # Windows 1256 (Arabic)
     186    return undef if (eof ($handle));
     187    return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
    181188    }
    182189
     
    242249    }
    243250   
    244     if ($self->{'encoding'} eq "latin1") {
     251    if ($self->{'encoding'} eq "iso_8859_1") {
    245252    # extended ascii (ISO-8859-1)
    246253    my $line = "";
     
    251258    }
    252259   
    253     if ($self->{'encoding'} eq "arabic") {
     260    if ($self->{'encoding'} eq "iso_8859_6") {
    254261    # 8 bit arabic (ISO-8859-6)
    255262    my $line = "";
     
    259266    return undef;
    260267    }
     268   
     269    if ($self->{'encoding'} eq "windows_1256") {
     270    # Windows 1256 (Arabic)
     271    my $line = "";
     272    if (defined ($line = <$handle>)) {
     273        return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
     274    }
     275    return undef;
     276    }
    261277
    262278    # unknown encoding
     
    266282
    267283# will convert entire contents of file to utf8 and append result to $outputref
     284# this may be a slightly faster way to get the contents of a file than by
     285# repeatedly calling read_line()
    268286sub read_file {
    269287    my $self = shift (@_);
     
    298316    if ($self->{'encoding'} eq "gb") {
    299317    undef $/;
    300     $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode (<$handle>));
    301     $/ = "\n";
    302     return;
    303     }
    304    
    305     if ($self->{'encoding'} eq "latin1") {
    306     undef $/;
    307     $$outputref .= &unicode::ascii2utf8 (<$handle>);
    308     $/ = "\n";
    309     return;
    310     }
    311    
    312     if ($self->{'encoding'} eq "arabic") {
    313     undef $/;
    314     $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode (<$handle>));
    315     $/ = "\n";
     318    my $text = <$handle>;
     319    $/ = "\n";
     320    $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
     321    return;
     322    }
     323   
     324    if ($self->{'encoding'} eq "iso_8859_1") {
     325    undef $/;
     326    my $text = <$handle>;
     327    $/ = "\n";
     328    $$outputref .= &unicode::ascii2utf8 ($text);
     329    return;
     330    }
     331   
     332    if ($self->{'encoding'} eq "iso_8859_6") {
     333    my $text = <$handle>;
     334    undef $/;
     335    $/ = "\n";
     336    $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
     337    return;
     338    }
     339
     340    if ($self->{'encoding'} eq "windows_1256") {
     341    undef $/;
     342    my $text = <$handle>;
     343    $/ = "\n";
     344    $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
    316345    return;
    317346    }
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1219 r1227  
    4747    print STDERR "                      gb (GB or GBK simplified Chinese)\n";
    4848    print STDERR "                      iso_8859_6 (8 bit Arabic)\n";
    49     print STDERR "                      Arabic (the same as iso-8859-6)\n";
     49    print STDERR "                      windows_1256 (Windows codepage 1256 (Arabic))\n";
     50    print STDERR "                      Arabic (the same as windows_1256)\n";
    5051    print STDERR "                      utf8 (either utf8 or unicode -- automatically detected)\n";
    5152    print STDERR "                      unicode (just unicode -- doesn't currently do endian\n";
     
    5758
    5859    my $self = {};
    59     my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|Arabic|utf8|unicode)\$";
     60    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$";
    6061
    6162    # general options available to all plugins
     
    109110    if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {
    110111    $encoding = "latin1";
    111     } elsif ($self->{'input_encoding'} =~ /^(Arabic|iso_8859_6)$/) {
    112     $encoding = "arabic";
     112    } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {
     113    $encoding = "windows_1256";
    113114    } else {
    114115    $encoding = $self->{'input_encoding'};
  • trunk/gsdl/perllib/unicode.pm

    r1223 r1227  
    3131package unicode;
    3232
    33 
     33%translations = ();
    3434
    3535# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
     
    6666    return $out;
    6767}
     68
     69# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
     70# and returns a unicode array. These encodings are similar to but not
     71# identical to the corresponding ISO-8859 encodings.
     72#
     73# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
     74sub windows2unicode {
     75    my ($encoding, $in) = @_;
     76    my $out = [];
     77
     78    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
     79                      "WINDOWS", "$encoding.TXT");
     80    return $out unless &loadmapping ($encoding, $mapfile);
     81
     82    my $i = 0;
     83    my $len = length($in);
     84    while ($i < $len) {
     85    my $c = ord(substr ($in, $i, 1));
     86    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
     87    push (@$out, $c);
     88    $i++;
     89    }
     90
     91    return $out;
     92}
     93
    6894
    6995# ascii2utf8 takes a (extended) ascii string and
     
    210236}
    211237
     238# loadmapping expects the mapfile to contain (at least) two
     239# tab-separated fields. The first field is the mapped value
     240# and the second field is the unicode value.
     241#
     242# It returns 1 if successful, 0 if unsuccessful
     243sub loadmapping {
     244    my ($encoding, $mapfile) = @_;
     245   
     246    my $to = "$encoding-unicode";
     247    my $from = "unicode-$encoding";
     248
     249    # check to see if the encoding has already been loaded
     250    if (defined $translations{$to} && defined $translations{$from}) {
     251    return 1;
     252    }
     253
     254    if (!open (MAPFILE, $mapfile)) {
     255    print STDERR "ERROR: unable to load mapfile $mapfile\n";
     256    return 0;
     257    }
     258
     259    my ($line, @line);
     260    $translations{$to} = {};
     261    $translations{$from} = {};
     262    while (defined ($line = <MAPFILE>)) {
     263    # remove comments
     264    $line =~ s/\#.*$//;
     265    next unless $line =~ /\S/;
     266
     267    # split the line into fields and do a few
     268    # simple sanity checks
     269    @line = split (/\t/, $line);
     270    next unless (scalar(@line) >= 2 &&
     271             $line[0] =~ /^0x/ &&
     272             $line[1] =~ /^0x/);
     273
     274    my $a = hex($line[0]);
     275    my $b = hex($line[1]);
     276
     277    $translations{$to}->{$a} = $b;
     278    $translations{$from}->{$b} = $a;
     279    }
     280
     281    close (MAPFILE);
     282
     283    return 1;
     284}
     285
    212286
    2132871;
    214 
Note: See TracChangeset for help on using the changeset viewer.