Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1227

Timestamp:

2000-06-21T15:58:49+12:00 (24 years ago)

Author:

sjboddie

Message:

Modified the perl code for importing arabic encoded documents. Plugins
now support a windows_1256 and an iso_8859_6 encoding. I was briefly under
the impression that these two encodings were similar enough to be treated
the same. It turns out they're not. It appears that the Windows codepage
1256 is the most commonly used Arabic encoding so "arabic" is a synonym
for windows_1256.

Location:

trunk/gsdl

Files:

: 6 edited

bin/script/togb.pl (modified) (2 diffs)
bin/script/touc.pl (modified) (1 diff)
bin/script/toutf8.pl (modified) (1 diff)
perllib/multiread.pm (modified) (8 diffs)
perllib/plugins/BasPlug.pm (modified) (3 diffs)
perllib/unicode.pm (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl/bin/script/togb.pl

-              r1226
+              r1227
 if (!parsargv::parse(\@ARGV,
              'unicode', \$unicode,
              'latin1', \$latin1,
+             'iso_8859_1', \$iso_8859_1,
              'gb', \$gb)) {
     print STDERR "\n  usage: $0 [options]\n\n";
     print STDERR "  options:\n";
     print STDERR "   -unicode    input is in utf-8 or unicode (default)\n";
     print STDERR "   -latin1     input is in extended ascii (ISO-8859-1)\n";
+    print STDERR "   -iso_8859_1 input is in extended ascii (ISO-8859-1 Latin 1)\n";
     print STDERR "   -gb         input is in GB or GBK (simplified Chinese)\n\n";
     die "\n";
 …
 $encoding = "utf8" if $unicode;
 $encoding = "latin1" if $latin1;
+$encoding = "iso_8859_1" if $iso_8859_1;
 $encoding = "gb" if $gb;

trunk/gsdl/bin/script/touc.pl

-              r1226
+              r1227
 if (!parsargv::parse(\@ARGV,
              'unicode', \$unicode,
+             'latin1', \$latin1,
+             'arabic', \$arabic,
+             'iso_8859_1', \$iso_8859_1,
+             'iso_8859_6', \$iso_8859_6,
+             'windows_1256', \$windows_1256,
              'gb', \$gb)) {
     print STDERR "\n  usage: $0 [options]\n\n";
     print STDERR "  options:\n";
+    print STDERR "   -unicode    input is in utf-8 or unicode (default)\n";
+    print STDERR "   -latin1     input is in extended ascii (ISO-8859-1)\n";
+    print STDERR "   -arabic     input is in 8 bit Arabic (ISO-8859-6)\n";
+    print STDERR "   -gb         input is in GB or GBK (simplified Chinese)\n\n";
+    print STDERR "   -unicode      input is in utf-8 or unicode (default)\n";
+    print STDERR "   -iso_8859_1   input is in extended ascii (ISO-8859-1 Latin 1)\n";
+    print STDERR "   -iso_8859_6   input is in 8 bit Arabic (ISO-8859-6)\n";
+    print STDERR "   -windows_1256 input is in Windows 1256 (Arabic)\n";
+    print STDERR "   -gb           input is in GB or GBK (simplified Chinese)\n\n";
     die "\n";
+}
 $encoding = "utf8" if $unicode;
+$encoding = "latin1" if $latin1;
+$encoding = "arabic" if $arabic;
+$encoding = "iso_8859_1" if $iso_8859_1;
+$encoding = "iso_8859_6" if $iso_8859_6;
+$encoding = "windows_1256" if $windows_1256;
 $encoding = "gb" if $gb;

trunk/gsdl/bin/script/toutf8.pl

-              r1226
+              r1227
 if (!parsargv::parse(\@ARGV,
              'unicode', \$unicode,
+             'latin1', \$latin1,
+             'arabic', \$arabic,
+             'iso_8859_1', \$iso_8859_1,
+             'iso_8859_6', \$iso_8859_6,
+             'windows_1256', \$windows_1256,
              'gb', \$gb)) {
     print STDERR "\n  usage: $0 [options]\n\n";
     print STDERR "  options:\n";
+    print STDERR "   -unicode    input is in utf-8 or unicode (default)\n";
+    print STDERR "   -latin1     input is in extended ascii (ISO-8859-1)\n";
+    print STDERR "   -arabic     input is in 8 bit Arabic (ISO-8859-6)\n";
+    print STDERR "   -gb         input is in GB or GBK (simplified Chinese)\n\n";
+    print STDERR "   -unicode      input is in utf-8 or unicode (default)\n";
+    print STDERR "   -iso_8859_1   input is in extended ascii (ISO-8859-1 Latin 1)\n";
+    print STDERR "   -iso_8859_6   input is in 8 bit Arabic (ISO-8859-6)\n";
+    print STDERR "   -windows_1256 input is in Windows 1256 (Arabic)\n";
+    print STDERR "   -gb           input is in GB or GBK (simplified Chinese)\n\n";
     die "\n";
+}
 $encoding = "utf8" if $unicode;
+$encoding = "latin1" if $latin1;
+$encoding = "arabic" if $arabic;
+$encoding = "iso_8859_1" if $iso_8859_1;
+$encoding = "iso_8859_6" if $iso_8859_6;
+$encoding = "windows_1256" if $windows_1256;
 $encoding = "gb" if $gb;

trunk/gsdl/perllib/multiread.pm

-              r1224
+              r1227
 # encodings currently supported are
+#
+# utf8     - either utf8 or unicode (automatically detected)
+# unicode  - just unicode (doesn't currently do endian detection)
+# gb       - GB
+# latin1   - extended ascii (iso-8859-1)
+# arabic   - 8 bit arabic (iso-8859-6)
+# utf8         - either utf8 or unicode (automatically detected)
+# unicode      - just unicode (doesn't currently do endian detection)
+# gb           - GB
+# iso_8859_1   - extended ascii (iso-8859-1)
+# iso_8859_6   - 8 bit arabic (iso-8859-6)
+# windows_1256 - Windows codepage 1256 (Arabic)
 package multiread;
 …
+    }
     if ($self->{'encoding'} eq "latin1") {
+    if ($self->{'encoding'} eq "iso_8859_1") {
     # Latin 1 extended ascii (ISO-8859-1)
     return undef if (eof ($handle));
 …
+    }
     if ($self->{'encoding'} eq "arabic") {
+    if ($self->{'encoding'} eq "iso_8859_6") {
     # 8 bit Arabic (ISO-8859-6)
     return undef if (eof ($handle));
     return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle)));
+    }
+    if ($self->{'encoding'} eq "windows_1256") {
+    # Windows 1256 (Arabic)
+    return undef if (eof ($handle));
+    return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle)));
+    }
 …
+    }
     if ($self->{'encoding'} eq "latin1") {
+    if ($self->{'encoding'} eq "iso_8859_1") {
     # extended ascii (ISO-8859-1)
     my $line = "";
 …
+    }
     if ($self->{'encoding'} eq "arabic") {
+    if ($self->{'encoding'} eq "iso_8859_6") {
     # 8 bit arabic (ISO-8859-6)
     my $line = "";
 …
     return undef;
+    }
+    if ($self->{'encoding'} eq "windows_1256") {
+    # Windows 1256 (Arabic)
+    my $line = "";
+    if (defined ($line = <$handle>)) {
+        return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line));
+    }
+    return undef;
+    }
     # unknown encoding
 …
 # will convert entire contents of file to utf8 and append result to $outputref
+# this may be a slightly faster way to get the contents of a file than by
+# repeatedly calling read_line()
 sub read_file {
     my $self = shift (@_);
 …
     if ($self->{'encoding'} eq "gb") {
     undef $/;
+    $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode (<$handle>));
+    $/ = "\n";
+    return;
+    }
+    if ($self->{'encoding'} eq "latin1") {
+    undef $/;
+    $$outputref .= &unicode::ascii2utf8 (<$handle>);
+    $/ = "\n";
+    return;
+    }
+    if ($self->{'encoding'} eq "arabic") {
+    undef $/;
+    $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode (<$handle>));
+    $/ = "\n";
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
+    return;
+    }
+    if ($self->{'encoding'} eq "iso_8859_1") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::ascii2utf8 ($text);
+    return;
+    }
+    if ($self->{'encoding'} eq "iso_8859_6") {
+    # 8 bit Arabic (ISO-8859-6): the record separator must be cleared
+    # *before* reading so that <$handle> returns the whole file rather
+    # than just its first line
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text));
+    return;
+    }
+    if ($self->{'encoding'} eq "windows_1256") {
+    undef $/;
+    my $text = <$handle>;
+    $/ = "\n";
+    $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text));
     return;
+    }

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r1219
+              r1227
     print STDERR "                      gb (GB or GBK simplified Chinese)\n";
     print STDERR "                      iso_8859_6 (8 bit Arabic)\n";
+    print STDERR "                      Arabic (the same as iso-8859-6)\n";
+    print STDERR "                      windows_1256 (Windows codepage 1256 (Arabic))\n";
+    print STDERR "                      Arabic (the same as windows_1256)\n";
     print STDERR "                      utf8 (either utf8 or unicode -- automatically detected)\n";
     print STDERR "                      unicode (just unicode -- doesn't currently do endian\n";
 …
     my $self = {};
     my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|Arabic|utf8|unicode)\$";
+    my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$";
     # general options available to all plugins
 …
     if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) {
     $encoding = "latin1";
     } elsif ($self->{'input_encoding'} =~ /^(Arabic|iso_8859_6)$/) {
     $encoding = "arabic";
+    } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) {
+    $encoding = "windows_1256";
     } else {
     $encoding = $self->{'input_encoding'};

trunk/gsdl/perllib/unicode.pm

-              r1223
+              r1227
 package unicode;
+%translations = ();
 # ascii2unicode takes an (extended) ascii string (ISO-8859-1)
 …
     return $out;
+}
+# windows2unicode takes a string in a windows encoding (e.g. Windows 1256
+# (Arabic)) and returns a reference to an array of unicode values, one for
+# each character of the input. These encodings are similar to but not
+# identical to the corresponding ISO-8859 encodings.
+#
+#   $encoding - codepage name used to locate the map file (e.g. "1256")
+#   $in       - the string to convert
+#
+# Values below 0x80 are passed through unchanged. Values from 0x80 upwards
+# are looked up in the "$encoding-unicode" table; a value with no entry in
+# the table becomes 0xFFFD (the unicode replacement character) instead of
+# an undefined array element. An undefined $in is treated as empty input.
+# If the mapping cannot be loaded an empty array is returned.
+# NOTE(review): assumes the downstream unicode2utf8 accepts 0xFFFD -- confirm.
+#
+# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
+sub windows2unicode {
+    my ($encoding, $in) = @_;
+    my $out = [];
+
+    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
+                      "WINDOWS", "$encoding.TXT");
+    return $out unless &loadmapping ($encoding, $mapfile);
+    return $out unless defined $in;
+
+    # fetch the table once rather than once per character
+    my $table = $translations{"$encoding-unicode"};
+
+    my $len = length($in);
+    for (my $i = 0; $i < $len; $i++) {
+        my $c = ord(substr ($in, $i, 1));
+        if ($c >= 0x80) {
+            my $mapped = $table->{$c};
+            # codepages may leave some positions unassigned
+            $c = defined $mapped ? $mapped : 0xFFFD;
+        }
+        push (@$out, $c);
+    }
+
+    return $out;
+}
 # ascii2utf8 takes a (extended) ascii string and
 …
+}
+# loadmapping reads a character mapping file and fills in two lookup
+# tables in %translations: "$encoding-unicode" (mapped value -> unicode
+# value) and "unicode-$encoding" (unicode value -> mapped value).
+#
+# It expects the mapfile to contain (at least) two tab-separated fields
+# per line. The first field is the mapped value and the second field is
+# the unicode value; both must start with "0x" and are read as
+# hexadecimal. Anything following a '#' is treated as a comment, and
+# lines without two such fields are skipped.
+#
+#   $encoding - name used to build the %translations keys
+#   $mapfile  - path of the map file to read
+#
+# It returns 1 if successful (or if the mapping was already loaded),
+# 0 if the mapfile could not be opened.
+sub loadmapping {
+    my ($encoding, $mapfile) = @_;
+
+    my $to = "$encoding-unicode";
+    my $from = "unicode-$encoding";
+
+    # check to see if the encoding has already been loaded
+    return 1 if (defined $translations{$to} && defined $translations{$from});
+
+    # localise the handle so a MAPFILE opened elsewhere is left untouched
+    local *MAPFILE;
+
+    # three-argument open: the file name is never interpreted as a mode
+    # or as a command to pipe from
+    if (!open (MAPFILE, "<", $mapfile)) {
+        print STDERR "ERROR: unable to load mapfile $mapfile\n";
+        return 0;
+    }
+
+    $translations{$to} = {};
+    $translations{$from} = {};
+
+    my $line;
+    while (defined ($line = <MAPFILE>)) {
+        # remove comments
+        $line =~ s/\#.*$//;
+        next unless $line =~ /\S/;
+
+        # split the line into fields and do a few
+        # simple sanity checks
+        my @fields = split (/\t/, $line);
+        next unless (scalar(@fields) >= 2 &&
+                     $fields[0] =~ /^0x/ &&
+                     $fields[1] =~ /^0x/);
+
+        # deliberately not called $a and $b, which belong to sort
+        my $mapped = hex($fields[0]);
+        my $unicode = hex($fields[1]);
+
+        $translations{$to}->{$mapped} = $unicode;
+        $translations{$from}->{$unicode} = $mapped;
+    }
+    close (MAPFILE);
+
+    return 1;
+}
 ;

Note: See TracChangeset for help on using the changeset viewer.