Changeset 1868


Ignore:
Timestamp:
2001-01-26T17:25:49+13:00 (23 years ago)
Author:
sjboddie
Message:

Made a bunch of changes to the building code to support lots of new
languages and encodings. It's still kind of a mess but should be fixed
up over the weekend.

Location:
trunk/gsdl
Files:
100 added
35 deleted
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/etc/main.cfg

    r1856 r1868  
    8484# longname  -- The display name of the given encoding. If longname isn't set
    8585#              it will default to using shortname instead.
    86 
    8786# type      -- The type of encoding. Note that for most encodings this
    8887#              value is the directory name under which the map file for
    8988#              this encoding resides in the Greenstone unicode/MAPPINGS
    9089#              directory (e.g. 'WINDOWS', 'ISO_8859' etc.). It may also
    91 #              take the values 'GB' and 'UTF8'.
     90#              take the values 'CJK' and 'UTF8'.
    9291# mapfile   -- The name of the map file for use when converting between
    9392#              utf8 and the given encoding. The mapfile option is mandatory
    94 #              for all encoding types with the exception of GB and UTF8.
     93#              for all encoding types with the exception of UTF8. If type
     94#              is CJK, mapfile is the abbreviated name of the encoding as
     95#              used by the binary mapping files (.ump files). i.e. if the
     96#              encoding uses the map files gbku.ump and ugbk.ump, mapfile
     97#              will be set to "gbk".
    9598# label     -- The standard label to which you must set the value of
    9699#              "charset" within http headers or html meta tags to get a web
     
    101104Encoding shortname=w1251 "longname=Cyrillic (Windows-1251)" type=WINDOWS mapfile=1251.TXT label=windows-1251
    102105Encoding shortname=w1256 "longname=Arabic (Windows-1256)" type=WINDOWS mapfile=1256.TXT label=windows-1256
    103 Encoding shortname=gb "longname=Simplified Chinese (GBK)" type=GB label=GBK
     106Encoding shortname=w1250 "longname=Central European (Windows-1250)" type=WINDOWS mapfile=1250.TXT label=windows-1250
     107Encoding shortname=gb "longname=Chinese Simplified (GBK)" type=CJK label=GBK mapfile=gbk
     108Encoding shortname=sjis "longname=Japanese (Shift-JIS)" type=CJK label=shift_jis mapfile=sjis
    104109Encoding shortname=koi8r "longname=Cyrillic (KOI8-R)" type=CYRILLIC mapfile=koi8_r.txt label=koi8-r
     110
     111# The following encoding is not currently supported
     112# Encoding shortname=eucjp "longname=Japanese (EUC)" type=CJK label=euc-jp mapfile=jis
    105113
    106114
  • trunk/gsdl/perllib/doc.pm

    r1844 r1868  
    130130}
    131131
    132 sub set_source_encoding {
    133     my $self = shift (@_);
    134     my ($source_encoding) = @_;
    135 
    136     $self->set_metadata_element ($self->get_top_section(),
    137                  "gsdlsourceencoding",
    138                  $source_encoding);
    139 }
    140 
    141 # returns the source_encoding as it was provided
    142 sub get_source_encoding {
    143     my $self = shift (@_);
    144    
    145     return $self->get_metadata_element ($self->get_top_section(), "gsdlsourceencoding");
    146 }
    147 
    148132sub _escape_text {
    149133    my ($text) = @_;
  • trunk/gsdl/perllib/multiread.pm

    r1844 r1868  
    3030# gb               - GB
    3131# iso_8859_[1-9]   - 8 bit extended ascii encodings
    32 # windows_125[0-6] - Windows codepages 1250 to 1256
     32# windows_125[0-8] - Windows codepages 1250 to 1258
     33# windows_874      - Windows codepage 874
     34# iscii_de         - ISCII Devanagari
     35# shift_jis        - Shift-JIS
     36# euc_jp           - EUC encoded Japanese
     37# uhc              - Unified Hangul Code (Korean)
    3338
    3439package multiread;
    3540
    3641use unicode;
    37 use gb;
     42use cjk;
    3843
    3944sub new {
     
    7378# if automatic detection between utf8 and unicode is desired
    7479# then the encoding should be initially set to utf8
    75 sub read_char {
     80sub read_unicode_char {
    7681    my $self = shift (@_);
    7782
     
    7984    return undef if ($self->{'handle'} eq "");
    8085    my $handle = $self->{'handle'};
     86    binmode ($handle);
    8187
    8288    if ($self->{'encoding'} eq "utf8") {
     
    99105            $self->{'encoding'} = "unicode";
    100106            $self->{'bigendian'} = 0;
    101             if ($ENV{'GSDLOS'} =~ /windows/i) {
    102                 binmode ($handle); # silly windows
    103             }
    104107            last;
    105108
     
    107110            $self->{'encoding'} = "unicode";
    108111            $self->{'bigendian'} = 1;
    109             if ($ENV{'GSDLOS'} =~ /windows/i) {
    110                 binmode ($handle); # silly windows
    111             }
    112112            last;
    113113            }
     
    153153    }
    154154
    155     if ($self->{'encoding'} eq "gb") {
    156     # GB or GBK
    157     return undef if (eof ($handle));
    158     my $c1 = getc ($handle);
    159     if (ord ($c1) >= 0x81) {
    160         # double byte character
    161         return undef if (eof ($handle));
    162         my $c2 = getc ($handle);
    163         return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));
    164        
    165     } else {
    166         # single byte character
    167         return &unicode::ascii2utf8 ($c1);
    168     }
    169     }
    170 
    171     if ($self->{'encoding'} eq "iso_8859_1") {
    172     # special case for iso_8859_1 as &ascii2utf8($char) is faster than
    173     # &unicode2utf8(iso2unicode('1', $char))
    174     return undef if (eof ($handle));
    175     return &unicode::ascii2utf8 (getc ($handle));
    176     }
    177    
    178     if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
    179     return undef if (eof ($handle));
    180     return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle)));
    181     }
    182 
    183     if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
    184     return undef if (eof ($handle));
    185     return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle)));
    186     }
    187 
    188     if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
    189     return undef if (eof ($handle));
    190     return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle)));
    191     }
    192 
    193     # unknown encoding
    194155    return undef;
    195156}
     
    211172    my $out = "";
    212173    my $thisc = "";
    213     while (defined ($thisc = $self->read_char())) {
     174    while (defined ($thisc = $self->read_unicode_char())) {
    214175        $out .= $thisc;
    215176        last if ($thisc eq "\n");
     
    219180    return undef;
    220181    }
    221 
    222182
    223183    if ($self->{'encoding'} eq "utf8") {
     
    247207    my $line = "";
    248208    if (defined ($line = <$handle>)) {
    249         return &unicode::unicode2utf8 (&gb::gb2unicode ($line));
     209        return &unicode::unicode2utf8 (&cjk::gb2unicode ($line));
    250210    }
    251211    return undef;
     
    270230    }
    271231
    272     if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
     232    if ($self->{'encoding'} =~ /windows_(\d{3,4})$/) {
    273233    my $line = "";
    274234    if (defined ($line = <$handle>)) {
     
    282242    if (defined ($line = <$handle>)) {
    283243        return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line));
     244    }
     245    return undef;
     246    }
     247
     248    if ($self->{'encoding'} eq "iscii_de") {
     249    my $line = "";
     250    if (defined ($line = <$handle>)) {
     251        return &unicode::unicode2utf8(&unicode::iscii2unicode ("Devanagari", $line));
    284252    }
    285253    return undef;
     
    328296    my $text = <$handle>;
    329297    $/ = "\n";
    330     $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text));
     298    $$outputref .= &unicode::unicode2utf8 (&cjk::gb2unicode ($text));
    331299    return;
    332300    }
     
    341309    return;
    342310    }
    343    
    344     if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {
    345     undef $/;
    346     my $text = <$handle>;
    347     $/ = "\n";
    348     $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text));
    349     return;
    350     }
    351 
    352     if ($self->{'encoding'} =~ /windows_(\d{4})$/) {
    353     undef $/;
    354     my $text = <$handle>;
    355     $/ = "\n";
    356     $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text));
    357     return;
    358     }
    359 
    360     if ($self->{'encoding'} =~ /^koi8_[ru]$/) {
    361     undef $/;
    362     my $text = <$handle>;
    363     $/ = "\n";
    364     $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text));
    365     return;
    366     }
     311
     312    if ($self->{'encoding'} eq "shift_jis") {
     313    undef $/;
     314    my $text = <$handle>;
     315    $/ = "\n";
     316    $$outputref .= &unicode::unicode2utf8(&cjk::sjis2unicode ($text));
     317    return;
     318    }
     319
     320    if ($self->{'encoding'} eq "euc_jp") {
     321    undef $/;
     322    my $text = <$handle>;
     323    $/ = "\n";
     324    $$outputref .= &unicode::unicode2utf8(&cjk::eucjp2unicode ($text));
     325    return;
     326    }
     327
     328    if ($self->{'encoding'} eq "euc_kr") {
     329    undef $/;
     330    my $text = <$handle>;
     331    $/ = "\n";
     332    $$outputref .= &unicode::unicode2utf8(&cjk::euckr2unicode ($text));
     333    return;
     334    }
     335
     336    if ($self->{'encoding'} eq "uhc") {
     337    undef $/;
     338    my $text = <$handle>;
     339    $/ = "\n";
     340    $$outputref .= &unicode::unicode2utf8(&cjk::uhc2unicode ($text));
     341    return;
     342    }
     343
     344    # if we get to here we assume it's a simple 8 bit encoding
     345    undef $/;
     346    my $text = <$handle>;
     347    $/ = "\n";
     348    $$outputref .= &unicode::unicode2utf8(&unicode::simple2unicode ($self->{'encoding'}, $text));
    367349}
    368350
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1857 r1868  
    4040%supported_encodings = (
    4141            "ascii" => "",
     42            "utf8" => "",
    4243            "iso_8859_1" => "",
    4344            "windows_1252" => "",
     
    5859            "iso_8859_9" => "",
    5960            "windows_1254" => "",
    60             "gb" => ""
     61            "gb" => "",
     62            "iscii_de" => "",
     63            "windows_1257" => "",
     64            "windows_874" => "",
     65            "windows_1258" => "",
     66            "shift_jis" => "",
     67            "euc_jp" => "",
     68            "uhc" => ""
    6169            );
    6270
     
    128136    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n";
    129137
    130     print STDERR "                       gb: GB or GBK simplified Chinese\n\n";
     138    print STDERR "                       gb: GB or GBK simplified Chinese\n";
     139
     140    print STDERR "                       iscii_de: ISCII Devanagari\n";
     141
     142    print STDERR "                       windows_1257: Windows codepage 1257 (WinBaltic)\n";
     143
     144    print STDERR "                       windows_874: Windows codepage 874 (Thai)\n";
     145
     146    print STDERR "                       windows_1258: Windows codepage 1258 (Vietnamese)\n";
     147
     148    print STDERR "                       shift_jis: Shift-JIS (Japanese)\n";
     149    print STDERR "                       euc_jp: EUC encoded Japanese\n";
     150
     151    print STDERR "                       uhc: Unified Hangul Code (Korean). This is a superset of\n";
     152    print STDERR "                            EUC encoded Korean\n\n";
     153
    131154
    132155    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n";
     
    144167    print STDERR "                     this value.\n\n";
    145168
    146     print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n";
     169    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n";
    147170
    148171    print STDERR "   -markup_acronyms  Add acronym metadata into document text\n\n";
     
    153176    print STDERR "   -extract_email    Extract email addresses as metadata\n\n";
    154177
    155     print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n\n";
    156     print STDERR "   -maximum_date     The maximum historical date to be used as metadata (in a Common Era date such as 1950)\n\n";
    157     print STDERR "   -maximum_century  The maximum named ceuntury to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century)\n\n";
    158     print STDERR "   -no_bibliography Do not try and block pbibliographic dates when extracting historical dates.\n\n";
     178    print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n";
     179    print STDERR "   -maximum_date     The maximum historical date to be used as metadata (in a Common Era\n";
     180    print STDERR "                     date such as 1950)\n";
     181    print STDERR "   -maximum_century  The maximum named century to be extracted as historical metadata\n";
     182    print STDERR "                     (e.g. 14 will extract all references up to the 14th century)\n";
     183    print STDERR "   -no_bibliography  Do not try and block bibliographic dates when extracting historical dates.\n\n";
    159184}
    160185
     
    163188sub print_usage {
    164189    print STDERR "\nThis plugin has no plugin specific options\n\n";
    165 
    166190}
    167191
     
    173197    my $enc = "^(";
    174198    map {$enc .= "|$_";} keys %supported_encodings;
    175     my $denc = $enc . "|utf8|unicode)\$";
    176     $enc .= "|utf8|unicode|auto)\$";
     199    my $denc = $enc . "|unicode)\$";
     200    $enc .= "|unicode|auto)\$";
    177201   
    178202    $self->{'outhandle'} = STDERR;
     
    321345    my $doc_obj = new doc ($filename, "indexed_doc");
    322346    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
    323     $doc_obj->set_source_encoding ($encoding);
    324    
     347    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
    325348   
    326349    # read in file ($text will be in utf8)
     
    426449
    427450    if (scalar @results != 1) {
     451   
    428452    if ($self->{'input_encoding'} ne 'auto') {
    429453        if ($self->{'extract_language'} && $self->{'verbosity'}) {
     
    444468    # format language/encoding
    445469    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
    446     $language = $iso639::toiso639{lc($language)};
    447470    die "Invalid language\n" if !defined $language;
    448471
     
    450473    # if textcat returned no encoding info it is assumed to be iso_8859_1
    451474    $encoding = "iso_8859_1";
    452     } else {
    453     # convert to the format we expect
    454     $encoding =~ s/windows/windows_/;
    455     $encoding =~ s/iso8859/iso_8859/;
    456     $encoding =~ s/^gb.*$/gb/;
    457475    }
    458476
  • trunk/gsdl/perllib/unicode.pm

    r1844 r1868  
    133133}
    134134
     135# iscii2unicode is basically identical to iso2unicode, the only
     136# difference being that the map files live in unicode/MAPPINGS/ISCII
     137#
     138# values for $encoding may be 'Devanagari' only at present
     139sub iscii2unicode {
     140    my ($encoding, $in) = @_;
     141    my $out = [];
     142
     143    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
     144                      "ISCII", "$encoding.txt");
     145    return $out unless &loadmapping ($encoding, $mapfile);
     146
     147    my $i = 0;
     148    my $len = length($in);
     149    while ($i < $len) {
     150    my $c = ord(substr ($in, $i, 1));
     151    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
     152    push (@$out, $c);
     153    $i++;
     154    }
     155
     156    return $out;
     157}
    135158
    136159# ascii2utf8 takes a (extended) ascii string and
     
    169192   
    170193    foreach $num (@$in) {
     194    next unless defined $num;
    171195    if ($num < 0x80) {
    172196        $out .= chr ($num);
     
    326350
    327351
     352
     353
     354
     355
     356
     357
     358
     359####################################################################################################
     360
     361
     362# %translations is of the form:
     363#
     364# translations{encodingname-encodingname}->blocktranslation
     365# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
     366#
     367# Any of the top translation blocks can point to an undefined
     368# value. This data structure aims to allow fast translation and
     369# efficient storage.
     370%translations = ();
     371
     372# @array256 is used for initialisation, there must be
     373# a better way...
     374@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     375         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     376         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     377         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     378         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     379         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     380         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     381         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     382         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     383         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     384         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     385         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     386         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     387         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     388         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
     389         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
     390
     391$encodings = {
     392    'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
     393             'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},
     394
     395    'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
     396             'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},
     397
     398    'iso_8859_3' => {'fullname' => 'Latin3',
     399             'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},
     400
     401    'iso_8859_4' => {'fullname' => 'Latin4',
     402             'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},
     403
     404    'iso_8859_5' => {'fullname' => 'Cyrillic',
     405             'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},
     406
     407    'iso_8859_6' => {'fullname' => 'Arabic',
     408             'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},
     409
     410    'iso_8859_7' => {'fullname' => 'Greek',
     411             'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},
     412
     413    'iso_8859_8' => {'fullname' => 'Hebrew',
     414             'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},
     415
     416    'iso_8859_9' => {'fullname' => 'Latin5',
     417             'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},
     418
     419    'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
     420               'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},
     421
     422    'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
     423               'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},
     424
     425    'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
     426               'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},
     427
     428    'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
     429               'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},
     430
     431    'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
     432               'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},
     433
     434    'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
     435               'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},
     436
     437    'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
     438               'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},
     439
     440    'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
     441               'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},
     442
     443    'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
     444               'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},
     445
     446    'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
     447              'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},
     448
     449    'koi8_r' => {'fullname' => 'Cyrillic',
     450         'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},
     451
     452    'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
     453         'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},
     454
     455    'iscii_de' => {'fullname' => 'ISCII Devanagari',
     456           'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
     457};
     458
     459# returns a pointer to unicode array
     460sub simple2unicode {
     461    my ($encoding, $intext) = @_;
     462
     463    if (!defined ($encodings->{$encoding})) {
     464    print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
     465    return [];
     466    }
     467
     468    my $info = $encodings->{$encoding};
     469    my $encodename = "$encoding-unicode";
     470    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
     471                      $info->{'mapfile'});
     472
     473    if (!&loadmapencoding ($encodename, $mapfile)) {
     474    print STDERR "unicode: ERROR - could not load encoding $encodename\n";
     475    return [];
     476    }
     477   
     478    my @outtext = ();
     479    my $len = length($intext);
     480    my ($c);
     481    my $i = 0;
     482
     483    while ($i < $len) {
     484    if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) {
     485        # normal ascii character
     486        push (@outtext, $c);
     487    } else {
     488        push (@outtext, &transchar ($encodename, $c));
     489    }
     490    $i ++;
     491    }
     492    return \@outtext;
     493}
     494
     495# returns 1 if successful, 0 if unsuccessful
     496sub loadmapencoding {
     497    my ($encoding, $mapfile) = @_;
     498   
     499    # check to see if the encoding has already been loaded
     500    return 1 if (defined $translations{$encoding});
     501
     502    return 0 unless open (MAPFILE, $mapfile);
     503    binmode (MAPFILE);
     504
     505    $translations{$encoding} = [@array256];
     506    my $block = $translations{$encoding};
     507
     508    my ($in,$i,$j);
     509    while (read(MAPFILE, $in, 1) == 1) {
     510    $i = unpack ("C", $in);
     511    $block->[$i] = [@array256];
     512    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
     513        my ($n1, $n2) = unpack ("CC", $in);
     514        $block->[$i]->[$j] = ($n1*256) + $n2;
     515    }
     516    }
     517
     518    close (MAPFILE);
     519}
     520
     521sub transchar {
     522    my ($encoding, $from) = @_;
     523    my $high = ($from / 256) % 256;
     524    my $low = $from % 256;
     525
     526    return 0 unless defined $translations{$encoding};
     527
     528    my $block = $translations{$encoding};
     529
     530    if (ref ($block->[$high]) ne "ARRAY") {
     531    return 0;
     532    }
     533    return $block->[$high]->[$low];
     534}
     535
     536
     537
     538
    3285391;
     540
  • trunk/gsdl/unicode/MAPPINGS/ISCII/Devanagari.txt

    r1522 r1868  
    11#  ISCII / IS 13194:1991
    2 
    3 # This table was generated by Stuart ([email protected]) for
    4 # the Greenstone Digital Library software from the ISCII (Indian
    5 # Script Code for Information Interchange). It maps from the
    6 # ISCII 7 bit code page covering Latin and Indian Scripts to
    7 # the Unicode 0900-907F range.
    82
    93# see Unicode Standard Version 2.0 pages 7-72
     
    115
    126#LETTERS
    13 0xA1    0x0901  # DEVANAGARI VOWEL-MODIFIER CHANDRABINDU
    14 0xA2    0x0902  # DEVANAGARI VOWEL-MODIFIER ANUSWAR
    15 0xA3    0x0903  # DEVANAGARI VOWEL-MODIFIER VISARG
     70xA1    0x0901  # DEVANAGARI VOWEL-MODIFIER CHANDRABINDU
     80xA2    0x0902  # DEVANAGARI VOWEL-MODIFIER ANUSWAR
     90xA3    0x0903  # DEVANAGARI VOWEL-MODIFIER VISARG
    1610
    17 0xA4    0x0905  # DEVANAGARI VOWEL A
    18 0xA5    0x0906  # DEVANAGARI VOWEL AA
    19 0xA6    0x0907  # DEVANAGARI VOWEL I
    20 0xA7    0x0908  # DEVANAGARI VOWEL II
    21 0xA8    0x0909  # DEVANAGARI VOWEL U
    22 0xA9    0x090A  # DEVANAGARI VOWEL UU
    23 0xAA    0x090B  # DEVANAGARI VOWEL RI
    24 0xAB    0x090E  # DEVANAGARI VOWEL E (SOUTHERN SCRIPTS)
    25 0xAC    0x090F  # DEVANAGARI VOWEL EY
    26 0xAD    0x0910  # DEVANAGARI VOWEL AI
    27 0xAE    0x090D  # DEVANAGARI VOWEL AYE (DEVANAGARI SCRIPT)
    28 0xAF    0x0912  # DEVANAGARI VOWEL O (SOUTHERN SCRIPTS)
     110xA4    0x0905  # DEVANAGARI VOWEL A
     120xA5    0x0906  # DEVANAGARI VOWEL AA
     130xA6    0x0907  # DEVANAGARI VOWEL I
     140xA7    0x0908  # DEVANAGARI VOWEL II
     150xA8    0x0909  # DEVANAGARI VOWEL U
     160xA9    0x090A  # DEVANAGARI VOWEL UU
     170xAA    0x090B  # DEVANAGARI VOWEL RI
     180xAB    0x090E  # DEVANAGARI VOWEL E (SOUTHERN SCRIPTS)
     190xAC    0x090F  # DEVANAGARI VOWEL EY
     200xAD    0x0910  # DEVANAGARI VOWEL AI
     210xAE    0x090D  # DEVANAGARI VOWEL AYE (DEVANAGARI SCRIPT)
     220xAF    0x0912  # DEVANAGARI VOWEL O (SOUTHERN SCRIPTS)
    2923
    30 0xB0    0x0913  # DEVANAGARI VOWEL OW
    31 0xB1    0x0914  # DEVANAGARI VOWEL AU
    32 0xB2    0x0911  # DEVANAGARI VOWEL AWE  (DEVANAGARI SCRIPT)
    33 0xB3    0x0915  # DEVANAGARI CONSONANT KA
    34 0xB4    0x0916  # DEVANAGARI CONSONANT KHA
    35 0xB5    0x0917  # DEVANAGARI CONSONANT GA
    36 0xB6    0x0918  # DEVANAGARI CONSONANT GHA
    37 0xB7    0x0919  # DEVANAGARI CONSONANT NGA
    38 0xB8    0x091A  # DEVANAGARI CONSONANT CHA
    39 0xB9    0x091B  # DEVANAGARI CONSONANT CHHA
    40 0xBA    0x091C  # DEVANAGARI CONSONANT JA
    41 0xBB    0x091D  # DEVANAGARI CONSONANT JHA
    42 0xBC    0x091E  # DEVANAGARI CONSONANT JNA
    43 0xBD    0x091F  # DEVANAGARI CONSONANT HARD TA
    44 0xBE    0x0920  # DEVANAGARI CONSONANT HARD THA
    45 0xBF    0x0921  # DEVANAGARI CONSONANT HARD DA
     240xB0    0x0913  # DEVANAGARI VOWEL OW
     250xB1    0x0914  # DEVANAGARI VOWEL AU
     260xB2    0x0911  # DEVANAGARI VOWEL AWE  (DEVANAGARI SCRIPT)
     270xB3    0x0915  # DEVANAGARI CONSONANT KA
     280xB4    0x0916  # DEVANAGARI CONSONANT KHA
     290xB5    0x0917  # DEVANAGARI CONSONANT GA
     300xB6    0x0918  # DEVANAGARI CONSONANT GHA
     310xB7    0x0919  # DEVANAGARI CONSONANT NGA
     320xB8    0x091A  # DEVANAGARI CONSONANT CHA
     330xB9    0x091B  # DEVANAGARI CONSONANT CHHA
     340xBA    0x091C  # DEVANAGARI CONSONANT JA
     350xBB    0x091D  # DEVANAGARI CONSONANT JHA
     360xBC    0x091E  # DEVANAGARI CONSONANT JNA
     370xBD    0x091F  # DEVANAGARI CONSONANT HARD TA
     380xBE    0x0920  # DEVANAGARI CONSONANT HARD THA
     390xBF    0x0921  # DEVANAGARI CONSONANT HARD DA
    4640
    47 0xC0    0x0922  # DEVANAGARI CONSONANT HARD DHA
    48 0xC1    0x0923  # DEVANAGARI CONSONANT HARD NA
    49 0xC2    0x0924  # DEVANAGARI CONSONANT SOFT TA
    50 0xC3    0x0925  # DEVANAGARI CONSONANT SOFT THA
    51 0xC4    0x0926  # DEVANAGARI CONSONANT SOFT DA
    52 0xC5    0x0927  # DEVANAGARI CONSONANT SOFT DHA
    53 0xC6    0x0928  # DEVANAGARI CONSONANT SOFT NA
    54 0xC7    0x0929  # DEVANAGARI CONSONANT NA (TAMIL)
    55 0xC8    0x092A  # DEVANAGARI CONSONANT PA
    56 0xC9    0x092B  # DEVANAGARI CONSONANT PHA
    57 0xCA    0x092C  # DEVANAGARI CONSONANT BA
    58 0xCB    0x092D  # DEVANAGARI CONSONANT BHA
    59 0xCC    0x092E  # DEVANAGARI CONSONANT MA
    60 0xCD    0x092F  # DEVANAGARI CONSONANT YA
     410xC0    0x0922  # DEVANAGARI CONSONANT HARD DHA
     420xC1    0x0923  # DEVANAGARI CONSONANT HARD NA
     430xC2    0x0924  # DEVANAGARI CONSONANT SOFT TA
     440xC3    0x0925  # DEVANAGARI CONSONANT SOFT THA
     450xC4    0x0926  # DEVANAGARI CONSONANT SOFT DA
     460xC5    0x0927  # DEVANAGARI CONSONANT SOFT DHA
     470xC6    0x0928  # DEVANAGARI CONSONANT SOFT NA
     480xC7    0x0929  # DEVANAGARI CONSONANT NA (TAMIL)
     490xC8    0x092A  # DEVANAGARI CONSONANT PA
     500xC9    0x092B  # DEVANAGARI CONSONANT PHA
     510xCA    0x092C  # DEVANAGARI CONSONANT BA
     520xCB    0x092D  # DEVANAGARI CONSONANT BHA
     530xCC    0x092E  # DEVANAGARI CONSONANT MA
     540xCD    0x092F  # DEVANAGARI CONSONANT YA
    6155# WARNING: THIS CHARACTER IS NON-CANONICAL
    62 0xCE    0x095F  # DEVANAGARI CONSONANT JKA (BENGALI, ASSAMESE & ORIYA)
    63 0xCF    0x0930  # DEVANAGARI CONSONANT RA
     560xCE    0x095F  # DEVANAGARI CONSONANT JKA (BENGALI, ASSAMESE & ORIYA)
     570xCF    0x0930  # DEVANAGARI CONSONANT RA
    6458
    65 0xD0    0x0931  # DEVANAGARI CONSONANT HARD RA (SOUTHERN SCRIPTS)
    66 0xD1    0x0932  # DEVANAGARI CONSONANT LA
    67 0xD2    0x0933  # DEVANAGARI CONSONANT HARD LA
    68 0xD3    0x0934  # DEVANAGARI CONSONANT ZHA (TAMIL & MALAYALAM)
    69 0xD4    0x0935  # DEVANAGARI CONSONANT VA
    70 0xD5    0x0936  # DEVANAGARI CONSONANT SHA
    71 0xD6    0x0937  # DEVANAGARI CONSONANT HARD SHA
    72 0xD7    0x0938  # DEVANAGARI CONSONANT SA
    73 0xD8    0x0939  # DEVANAGARI CONSONANT HA
    74 #0xD9    0x0900  # DEVANAGARI INVISIBLE (NO UNICODE EQUALIVENT)
    75 0xDA    0x093E  # DEVANAGARI VOWEL SIGN AA
    76 0xDB    0x093F  # DEVANAGARI VOWEL SIGN I
    77 0xDC    0x0940  # DEVANAGARI VOWEL SIGN II
    78 0xDD    0x0941  # DEVANAGARI VOWEL SIGN U
    79 0xDE    0x0942  # DEVANAGARI VOWEL SIGN UU
    80 0xDF    0x0943  # DEVANAGARI VOWEL SIGN RI
     590xD0    0x0931  # DEVANAGARI CONSONANT HARD RA (SOUTHERN SCRIPTS)
     600xD1    0x0932  # DEVANAGARI CONSONANT LA
     610xD2    0x0933  # DEVANAGARI CONSONANT HARD LA
     620xD3    0x0934  # DEVANAGARI CONSONANT ZHA (TAMIL & MALAYALAM)
     630xD4    0x0935  # DEVANAGARI CONSONANT VA
     640xD5    0x0936  # DEVANAGARI CONSONANT SHA
     650xD6    0x0937  # DEVANAGARI CONSONANT HARD SHA
     660xD7    0x0938  # DEVANAGARI CONSONANT SA
     670xD8    0x0939  # DEVANAGARI CONSONANT HA
     68#0xD9   0x0900  # DEVANAGARI INVISIBLE (NO UNICODE EQUIVALENT)
     690xDA    0x093E  # DEVANAGARI VOWEL SIGN AA
     700xDB    0x093F  # DEVANAGARI VOWEL SIGN I
     710xDC    0x0940  # DEVANAGARI VOWEL SIGN II
     720xDD    0x0941  # DEVANAGARI VOWEL SIGN U
     730xDE    0x0942  # DEVANAGARI VOWEL SIGN UU
     740xDF    0x0943  # DEVANAGARI VOWEL SIGN RI
    8175
    82 0xE0    0x0946  # DEVANAGARI VOWEL SIGN E (SOUTHERN SCRIPTS)
    83 0xE1    0x0947  # DEVANAGARI VOWEL SIGN EY
    84 0xE2    0x0948  # DEVANAGARI VOWEL SIGN AI
    85 0xE3    0x0945  # DEVANAGARI VOWEL SIGN AYE (DEVANAGARI SCRIPT)
    86 0xE4    0x094A  # DEVANAGARI VOWEL SIGN O SOUTHERN SCRIPTS)
    87 0xE5    0x094B  # DEVANAGARI VOWEL SIGN OW
    88 0xE6    0x094C  # DEVANAGARI VOWEL SIGN AU
    89 0xE7    0x0949  # DEVANAGARI VOWEL SIGN AWE (DEVANAGARI SCRIPT)
    90 0xE8    0x094D  # DEVANAGARI VOWEL SIGN OMISSION SIGN (HALANT)
     760xE0    0x0946  # DEVANAGARI VOWEL SIGN E (SOUTHERN SCRIPTS)
     770xE1    0x0947  # DEVANAGARI VOWEL SIGN EY
     780xE2    0x0948  # DEVANAGARI VOWEL SIGN AI
     790xE3    0x0945  # DEVANAGARI VOWEL SIGN AYE (DEVANAGARI SCRIPT)
     800xE4    0x094A  # DEVANAGARI VOWEL SIGN O (SOUTHERN SCRIPTS)
     810xE5    0x094B  # DEVANAGARI VOWEL SIGN OW
     820xE6    0x094C  # DEVANAGARI VOWEL SIGN AU
     830xE7    0x0949  # DEVANAGARI VOWEL SIGN AWE (DEVANAGARI SCRIPT)
     840xE8    0x094D  # DEVANAGARI VOWEL SIGN OMISSION SIGN (HALANT)
    9185
    9286#PUNCTUATION
    93 0xE9    0x093C  # DEVANAGARI DIACRITIC SIGN (NUKTA)
    94 0xEA    0x0964  # DEVANAGARI FULL STOP
     870xE9    0x093C  # DEVANAGARI DIACRITIC SIGN (NUKTA)
     880xEA    0x0964  # DEVANAGARI FULL STOP
    9589
    9690#DIGITS
    97 0xF1    0x0966  # DEVANAGARI DIGIT ZERO
    98 0xF2    0x0967  # DEVANAGARI DIGIT ONE
    99 0xF3    0x0968  # DEVANAGARI DIGIT TWO
    100 0xF4    0x0969  # DEVANAGARI DIGIT THREE
    101 0xF5    0x096A  # DEVANAGARI DIGIT FOUR
    102 0xF6    0x096B  # DEVANAGARI DIGIT FIVE
    103 0xF7    0x096C  # DEVANAGARI DIGIT SIX
    104 0xF8    0x096D  # DEVANAGARI DIGIT SEVEN
    105 0xF9    0x096E  # DEVANAGARI DIGIT EIGHT
    106 0xFA    0x096F  # DEVANAGARI DIGIT NINE
     910xF1    0x0966  # DEVANAGARI DIGIT ZERO
     920xF2    0x0967  # DEVANAGARI DIGIT ONE
     930xF3    0x0968  # DEVANAGARI DIGIT TWO
     940xF4    0x0969  # DEVANAGARI DIGIT THREE
     950xF5    0x096A  # DEVANAGARI DIGIT FOUR
     960xF6    0x096B  # DEVANAGARI DIGIT FIVE
     970xF7    0x096C  # DEVANAGARI DIGIT SIX
     980xF8    0x096D  # DEVANAGARI DIGIT SEVEN
     990xF9    0x096E  # DEVANAGARI DIGIT EIGHT
     1000xFA    0x096F  # DEVANAGARI DIGIT NINE
Note: See TracChangeset for help on using the changeset viewer.