Changeset 1868

Show
Ignore:
Timestamp:
26.01.2001 17:25:49 (19 years ago)
Author:
sjboddie
Message:

Made a bunch of changes to the building code to support lots of new
languages and encodings. It's still kind of a mess but should be fixed
up over the weekend.

Location:
trunk/gsdl
Files:
100 added
35 removed
6 modified

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/etc/main.cfg

    r1856 r1868  
    8484# longname  -- The display name of the given encoding. If longname isn't set 
    8585#              it will default to using shortname instead. 
    86  
    8786# type      -- The type of encoding. Note that for most encodings this 
    8887#              value is the directory name under which the map file for 
    8988#              this encoding resides in the Greenstone unicode/MAPPINGS 
    9089#              directory (e.g. 'WINDOWS', 'ISO_8859' etc.). It may also 
    91 #              take the values 'GB' and 'UTF8'. 
     90#              take the values 'CJK' and 'UTF8'. 
    9291# mapfile   -- The name of the map file for use when converting between 
    9392#              utf8 and the given encoding. The mapfile option is mandatory 
    94 #              for all encoding types with the exception of GB and UTF8. 
     93#              for all encoding types with the exception of UTF8. If type 
     94#              is CJK, mapfile is the abbreviated name of the encoding as 
     95#              used by the binary mapping files (.ump files). i.e. if the 
     96#              encoding uses the map files gbku.ump and ugbk.ump, mapfile 
     97#              will be set to "gbk". 
    9598# label     -- The standard label to which you must set the value of 
    9699#              "charset" within http headers or html meta tags to get a web 
     
    101104Encoding shortname=w1251 "longname=Cyrillic (Windows-1251)" type=WINDOWS mapfile=1251.TXT label=windows-1251 
    102105Encoding shortname=w1256 "longname=Arabic (Windows-1256)" type=WINDOWS mapfile=1256.TXT label=windows-1256 
    103 Encoding shortname=gb "longname=Simplified Chinese (GBK)" type=GB label=GBK 
     106Encoding shortname=w1250 "longname=Central European (Windows-1250)" type=WINDOWS mapfile=1250.TXT label=windows-1250 
     107Encoding shortname=gb "longname=Chinese Simplified (GBK)" type=CJK label=GBK mapfile=gbk 
     108Encoding shortname=sjis "longname=Japanese (Shift-JIS)" type=CJK label=shift_jis mapfile=sjis 
    104109Encoding shortname=koi8r "longname=Cyrillic (KOI8-R)" type=CYRILLIC mapfile=koi8_r.txt label=koi8-r 
     110 
     111# The following encoding is not currently supported 
     112# Encoding shortname=eucjp "longname=Japanese (EUC)" type=CJK label=euc-jp mapfile=jis 
    105113 
    106114 
  • trunk/gsdl/perllib/doc.pm

    r1844 r1868  
    130130} 
    131131 
    132 sub set_source_encoding { 
    133     my $self = shift (@_); 
    134     my ($source_encoding) = @_; 
    135  
    136     $self->set_metadata_element ($self->get_top_section(),  
    137                  "gsdlsourceencoding",  
    138                  $source_encoding); 
    139 } 
    140  
    141 # returns the source_encoding as it was provided 
    142 sub get_source_encoding { 
    143     my $self = shift (@_); 
    144      
    145     return $self->get_metadata_element ($self->get_top_section(), "gsdlsourceencoding"); 
    146 } 
    147  
    148132sub _escape_text { 
    149133    my ($text) = @_; 
  • trunk/gsdl/perllib/multiread.pm

    r1844 r1868  
    3030# gb               - GB 
    3131# iso_8859_[1-9]   - 8 bit extended ascii encodings 
    32 # windows_125[0-6] - Windows codepages 1250 to 1256 
     32# windows_125[0-8] - Windows codepages 1250 to 1258 
     33# windows_874      - Windows codepage 874 
     34# iscii_de         - ISCII Devanagari 
     35# shift_jis        - Shift-JIS 
     36# euc_jp           - EUC encoded Japanese 
     37# uhc              - Unified Hangul Code (Korean) 
    3338 
    3439package multiread; 
    3540 
    3641use unicode; 
    37 use gb; 
     42use cjk; 
    3843 
    3944sub new { 
     
    7378# if automatic detection between utf8 and unicode is desired 
    7479# then the encoding should be initially set to utf8 
    75 sub read_char { 
     80sub read_unicode_char { 
    7681    my $self = shift (@_); 
    7782 
     
    7984    return undef if ($self->{'handle'} eq ""); 
    8085    my $handle = $self->{'handle'}; 
     86    binmode ($handle); 
    8187 
    8288    if ($self->{'encoding'} eq "utf8") { 
     
    99105            $self->{'encoding'} = "unicode"; 
    100106            $self->{'bigendian'} = 0; 
    101             if ($ENV{'GSDLOS'} =~ /windows/i) { 
    102                 binmode ($handle); # silly windows 
    103             } 
    104107            last; 
    105108 
     
    107110            $self->{'encoding'} = "unicode"; 
    108111            $self->{'bigendian'} = 1; 
    109             if ($ENV{'GSDLOS'} =~ /windows/i) { 
    110                 binmode ($handle); # silly windows 
    111             } 
    112112            last; 
    113113            } 
     
    153153    } 
    154154 
    155     if ($self->{'encoding'} eq "gb") { 
    156     # GB or GBK 
    157     return undef if (eof ($handle)); 
    158     my $c1 = getc ($handle); 
    159     if (ord ($c1) >= 0x81) { 
    160         # double byte character 
    161         return undef if (eof ($handle)); 
    162         my $c2 = getc ($handle); 
    163         return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2)); 
    164          
    165     } else { 
    166         # single byte character 
    167         return &unicode::ascii2utf8 ($c1); 
    168     } 
    169     } 
    170  
    171     if ($self->{'encoding'} eq "iso_8859_1") { 
    172     # special case for iso_8859_1 as &ascii2utf8($char) is faster than 
    173     # &unicode2utf8(iso2unicode('1', $char)) 
    174     return undef if (eof ($handle)); 
    175     return &unicode::ascii2utf8 (getc ($handle)); 
    176     } 
    177      
    178     if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 
    179     return undef if (eof ($handle)); 
    180     return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle))); 
    181     } 
    182  
    183     if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 
    184     return undef if (eof ($handle)); 
    185     return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle))); 
    186     } 
    187  
    188     if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 
    189     return undef if (eof ($handle)); 
    190     return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle))); 
    191     } 
    192  
    193     # unknown encoding 
    194155    return undef; 
    195156} 
     
    211172    my $out = ""; 
    212173    my $thisc = ""; 
    213     while (defined ($thisc = $self->read_char())) { 
     174    while (defined ($thisc = $self->read_unicode_char())) { 
    214175        $out .= $thisc; 
    215176        last if ($thisc eq "\n"); 
     
    219180    return undef; 
    220181    } 
    221  
    222182 
    223183    if ($self->{'encoding'} eq "utf8") { 
     
    247207    my $line = ""; 
    248208    if (defined ($line = <$handle>)) { 
    249         return &unicode::unicode2utf8 (&gb::gb2unicode ($line)); 
     209        return &unicode::unicode2utf8 (&cjk::gb2unicode ($line)); 
    250210    } 
    251211    return undef; 
     
    270230    } 
    271231 
    272     if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 
     232    if ($self->{'encoding'} =~ /windows_(\d{3,4})$/) { 
    273233    my $line = ""; 
    274234    if (defined ($line = <$handle>)) { 
     
    282242    if (defined ($line = <$handle>)) { 
    283243        return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line)); 
     244    } 
     245    return undef; 
     246    } 
     247 
     248    if ($self->{'encoding'} eq "iscii_de") { 
     249    my $line = ""; 
     250    if (defined ($line = <$handle>)) { 
     251        return &unicode::unicode2utf8(&unicode::iscii2unicode ("Devanagari", $line)); 
    284252    } 
    285253    return undef; 
     
    328296    my $text = <$handle>; 
    329297    $/ = "\n"; 
    330     $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text)); 
     298    $$outputref .= &unicode::unicode2utf8 (&cjk::gb2unicode ($text)); 
    331299    return; 
    332300    } 
     
    341309    return; 
    342310    } 
    343      
    344     if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 
    345     undef $/; 
    346     my $text = <$handle>; 
    347     $/ = "\n"; 
    348     $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text)); 
    349     return; 
    350     } 
    351  
    352     if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 
    353     undef $/; 
    354     my $text = <$handle>; 
    355     $/ = "\n"; 
    356     $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text)); 
    357     return; 
    358     } 
    359  
    360     if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 
    361     undef $/; 
    362     my $text = <$handle>; 
    363     $/ = "\n"; 
    364     $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text)); 
    365     return; 
    366     } 
     311 
     312    if ($self->{'encoding'} eq "shift_jis") { 
     313    undef $/; 
     314    my $text = <$handle>; 
     315    $/ = "\n"; 
     316    $$outputref .= &unicode::unicode2utf8(&cjk::sjis2unicode ($text)); 
     317    return; 
     318    } 
     319 
     320    if ($self->{'encoding'} eq "euc_jp") { 
     321    undef $/; 
     322    my $text = <$handle>; 
     323    $/ = "\n"; 
     324    $$outputref .= &unicode::unicode2utf8(&cjk::eucjp2unicode ($text)); 
     325    return; 
     326    } 
     327 
     328    if ($self->{'encoding'} eq "euc_kr") { 
     329    undef $/; 
     330    my $text = <$handle>; 
     331    $/ = "\n"; 
     332    $$outputref .= &unicode::unicode2utf8(&cjk::euckr2unicode ($text)); 
     333    return; 
     334    } 
     335 
     336    if ($self->{'encoding'} eq "uhc") { 
     337    undef $/; 
     338    my $text = <$handle>; 
     339    $/ = "\n"; 
     340    $$outputref .= &unicode::unicode2utf8(&cjk::uhc2unicode ($text)); 
     341    return; 
     342    } 
     343 
     344    # if we get to here we assume it's a simple 8 bit encoding 
     345    undef $/; 
     346    my $text = <$handle>; 
     347    $/ = "\n"; 
     348    $$outputref .= &unicode::unicode2utf8(&unicode::simple2unicode ($self->{'encoding'}, $text)); 
    367349} 
    368350 
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r1857 r1868  
    4040%supported_encodings = ( 
    4141            "ascii" => "", 
     42            "utf8" => "", 
    4243            "iso_8859_1" => "", 
    4344            "windows_1252" => "", 
     
    5859            "iso_8859_9" => "", 
    5960            "windows_1254" => "", 
    60             "gb" => "" 
     61            "gb" => "", 
     62            "iscii_de" => "", 
     63            "windows_1257" => "", 
     64            "windows_874" => "", 
     65            "windows_1258" => "", 
     66            "shift_jis" => "", 
     67            "euc_jp" => "", 
     68            "uhc" => "" 
    6169            ); 
    6270 
     
    128136    print STDERR "                       windows_1254: Windows codepage 1254 (WinTurkish)\n"; 
    129137 
    130     print STDERR "                       gb: GB or GBK simplified Chinese\n\n"; 
     138    print STDERR "                       gb: GB or GBK simplified Chinese\n"; 
     139 
     140    print STDERR "                       iscii_de: ISCII Devanagari\n"; 
     141 
     142    print STDERR "                       windows_1257: Windows codepage 1257 (WinBaltic)\n"; 
     143 
     144    print STDERR "                       windows_874: Windows codepage 874 (Thai)\n"; 
     145 
     146    print STDERR "                       windows_1258: Windows codepage 1258 (Vietnamese)\n"; 
     147 
     148    print STDERR "                       shift_jis: Shift-JIS (Japanese)\n"; 
     149    print STDERR "                       euc_jp: EUC encoded Japanese\n"; 
     150 
     151    print STDERR "                       uhc: Unified Hangul Code (Korean). This is a superset of\n"; 
     152    print STDERR "                            EUC encoded Korean\n\n"; 
     153 
    131154 
    132155    print STDERR "   -default_encoding If -input_encoding is set to 'auto' and the text categorization\n"; 
     
    144167    print STDERR "                     this value.\n\n"; 
    145168 
    146     print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n\n"; 
     169    print STDERR "   -extract_acronyms Extract acronyms from within text and set as metadata\n"; 
    147170 
    148171    print STDERR "   -markup_acronyms  Add acronym metadata into document text\n\n"; 
     
    153176    print STDERR "   -extract_email    Extract email addresses as metadata\n\n"; 
    154177 
    155     print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n\n"; 
    156     print STDERR "   -maximum_date     The maximum historical date to be used as metadata (in a Common Era date such as 1950)\n\n"; 
    157     print STDERR "   -maximum_century  The maximum named ceuntury to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century)\n\n"; 
    158     print STDERR "   -no_bibliography Do not try and block pbibliographic dates when extracting historical dates.\n\n";  
     178    print STDERR "   -extract_date     Extract dates pertaining to the content of documents about history\n"; 
     179    print STDERR "   -maximum_date     The maximum historical date to be used as metadata (in a Common Era\n"; 
     180    print STDERR "                     date such as 1950)\n"; 
     181    print STDERR "   -maximum_century  The maximum named century to be extracted as historical metadata\n"; 
     182    print STDERR "                     (e.g. 14 will extract all references up to the 14th century)\n"; 
     183    print STDERR "   -no_bibliography  Do not try and block bibliographic dates when extracting historical dates.\n\n";  
    159184} 
    160185 
     
    163188sub print_usage { 
    164189    print STDERR "\nThis plugin has no plugin specific options\n\n"; 
    165  
    166190} 
    167191 
     
    173197    my $enc = "^("; 
    174198    map {$enc .= "|$_";} keys %supported_encodings; 
    175     my $denc = $enc . "|utf8|unicode)\$"; 
    176     $enc .= "|utf8|unicode|auto)\$"; 
     199    my $denc = $enc . "|unicode)\$"; 
     200    $enc .= "|unicode|auto)\$"; 
    177201     
    178202    $self->{'outhandle'} = STDERR; 
     
    321345    my $doc_obj = new doc ($filename, "indexed_doc"); 
    322346    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 
    323     $doc_obj->set_source_encoding ($encoding); 
    324      
     347    $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 
    325348     
    326349    # read in file ($text will be in utf8) 
     
    426449 
    427450    if (scalar @results != 1) { 
     451     
    428452    if ($self->{'input_encoding'} ne 'auto') { 
    429453        if ($self->{'extract_language'} && $self->{'verbosity'}) { 
     
    444468    # format language/encoding 
    445469    my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/; 
    446     $language = $iso639::toiso639{lc($language)}; 
    447470    die "Invalid language\n" if !defined $language; 
    448471 
     
    450473    # if textcat returned no encoding info it is assumed to be iso_8859_1 
    451474    $encoding = "iso_8859_1"; 
    452     } else { 
    453     # convert to the format we expect 
    454     $encoding =~ s/windows/windows_/; 
    455     $encoding =~ s/iso8859/iso_8859/; 
    456     $encoding =~ s/^gb.*$/gb/; 
    457475    } 
    458476 
  • trunk/gsdl/perllib/unicode.pm

    r1844 r1868  
    133133} 
    134134 
     135# iscii2unicode is basically identical to iso2unicode, the only 
     136# difference being that the map files live in unicode/MAPPINGS/ISCII 
     137# 
     138# values for $encoding may be 'Devanagari' only at present 
     139sub iscii2unicode { 
     140    my ($encoding, $in) = @_; 
     141    my $out = []; 
     142 
     143    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",  
     144                      "ISCII", "$encoding.txt");  
     145    return $out unless &loadmapping ($encoding, $mapfile); 
     146 
     147    my $i = 0; 
     148    my $len = length($in); 
     149    while ($i < $len) { 
     150    my $c = ord(substr ($in, $i, 1)); 
     151    $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0); 
     152    push (@$out, $c); 
     153    $i++; 
     154    } 
     155 
     156    return $out; 
     157} 
    135158 
    136159# ascii2utf8 takes a (extended) ascii string and 
     
    169192     
    170193    foreach $num (@$in) { 
     194    next unless defined $num; 
    171195    if ($num < 0x80) { 
    172196        $out .= chr ($num); 
     
    326350 
    327351 
     352 
     353 
     354 
     355 
     356 
     357 
     358 
     359#################################################################################################### 
     360 
     361 
     362# %translations is of the form: 
     363# 
     364# translations{encodingname-encodingname}->blocktranslation 
     365# blocktranslation->[[0-255],[256-511], ..., [65280-65535]] 
     366# 
     367# Any of the top translation blocks can point to an undefined 
     368# value. This data structure aims to allow fast translation and  
     369# efficient storage. 
     370%translations = (); 
     371 
     372# @array256 is used for initialisation, there must be 
     373# a better way... 
     374@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     375         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     376         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     377         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     378         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     379         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     380         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     381         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     382         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     383         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     384         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     385         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     386         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     387         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     388         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 
     389         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); 
     390 
     391$encodings = { 
     392    'iso_8859_1' => {'fullname' => 'Latin1 (western languages)', 
     393             'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0}, 
     394 
     395    'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)', 
     396             'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0}, 
     397 
     398    'iso_8859_3' => {'fullname' => 'Latin3', 
     399             'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0}, 
     400 
     401    'iso_8859_4' => {'fullname' => 'Latin4', 
     402             'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0}, 
     403 
     404    'iso_8859_5' => {'fullname' => 'Cyrillic', 
     405             'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0}, 
     406 
     407    'iso_8859_6' => {'fullname' => 'Arabic', 
     408             'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0}, 
     409 
     410    'iso_8859_7' => {'fullname' => 'Greek',  
     411             'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0}, 
     412 
     413    'iso_8859_8' => {'fullname' => 'Hebrew',  
     414             'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0}, 
     415 
     416    'iso_8859_9' => {'fullname' => 'Latin5', 
     417             'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0}, 
     418 
     419    'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)', 
     420               'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80}, 
     421 
     422    'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)', 
     423               'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80}, 
     424 
     425    'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)', 
     426               'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80}, 
     427 
     428    'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',  
     429               'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80}, 
     430 
     431    'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)', 
     432               'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80}, 
     433 
     434    'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',  
     435               'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80}, 
     436 
     437    'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',  
     438               'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80}, 
     439 
     440    'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)', 
     441               'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80}, 
     442 
     443    'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)', 
     444               'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80}, 
     445 
     446    'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)', 
     447              'mapfile' => 'win874.ump', 'ascii_delim' => 0x80}, 
     448 
     449    'koi8_r' => {'fullname' => 'Cyrillic', 
     450         'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80}, 
     451 
     452    'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)', 
     453         'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80}, 
     454 
     455    'iscii_de' => {'fullname' => 'ISCII Devanagari', 
     456           'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0} 
     457}; 
     458 
     459# returns a reference to an array of unicode code points 
     460sub simple2unicode { 
     461    my ($encoding, $intext) = @_; 
     462 
     463    if (!defined ($encodings->{$encoding})) { 
     464    print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n"; 
     465    return []; 
     466    } 
     467 
     468    my $info = $encodings->{$encoding}; 
     469    my $encodename = "$encoding-unicode"; 
     470    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc", 
     471                      $info->{'mapfile'}); 
     472 
     473    if (!&loadmapencoding ($encodename, $mapfile)) { 
     474    print STDERR "unicode: ERROR - could not load encoding $encodename\n"; 
     475    return []; 
     476    } 
     477     
     478    my @outtext = (); 
     479    my $len = length($intext); 
     480    my ($c); 
     481    my $i = 0; 
     482 
     483    while ($i < $len) { 
     484    if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) { 
     485        # normal ascii character 
     486        push (@outtext, $c); 
     487    } else { 
     488        push (@outtext, &transchar ($encodename, $c)); 
     489    } 
     490    $i ++; 
     491    } 
     492    return \@outtext; 
     493} 
     494 
     495# returns 1 if successful, 0 if unsuccessful 
     496sub loadmapencoding { 
     497    my ($encoding, $mapfile) = @_; 
     498     
     499    # check to see if the encoding has already been loaded 
     500    return 1 if (defined $translations{$encoding}); 
     501 
     502    return 0 unless open (MAPFILE, $mapfile); 
     503    binmode (MAPFILE); 
     504 
     505    $translations{$encoding} = [@array256]; 
     506    my $block = $translations{$encoding}; 
     507 
     508    my ($in,$i,$j); 
     509    while (read(MAPFILE, $in, 1) == 1) { 
     510    $i = unpack ("C", $in); 
     511    $block->[$i] = [@array256]; 
     512    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) { 
     513        my ($n1, $n2) = unpack ("CC", $in); 
     514        $block->[$i]->[$j] = ($n1*256) + $n2; 
     515    } 
     516    } 
     517 
     518    close (MAPFILE); 
     519} 
     520 
     521sub transchar { 
     522    my ($encoding, $from) = @_; 
     523    my $high = ($from / 256) % 256; 
     524    my $low = $from % 256; 
     525 
     526    return 0 unless defined $translations{$encoding}; 
     527 
     528    my $block = $translations{$encoding}; 
     529 
     530    if (ref ($block->[$high]) ne "ARRAY") { 
     531    return 0; 
     532    } 
     533    return $block->[$high]->[$low]; 
     534} 
     535 
     536 
     537 
     538 
    3285391; 
     540 
  • trunk/gsdl/unicode/MAPPINGS/ISCII/Devanagari.txt

    r1522 r1868  
    11#  ISCII / IS 13194:1991 
    2  
    3 # This table was generated by Stuart (say1@cs.waikato.ac.nz) for 
    4 # the Greenstone Digital Library software from the ISCII (Indian  
    5 # Script Code for Information Interchange). It maps from the  
    6 # ISCII 7 bit code page covering Latin and Indian Scripts to  
    7 # the Unicode 0900-907F range. 
    82 
    93# see Unicode Standard Version 2.0 pages 7-72  
     
    115 
    126#LETTERS 
    13 0xA1    0x0901  # DEVANAGARI VOWEL-MODIFIER CHANDRABINDU 
    14 0xA2    0x0902  # DEVANAGARI VOWEL-MODIFIER ANUSWAR 
    15 0xA3    0x0903  # DEVANAGARI VOWEL-MODIFIER VISARG 
     70xA1    0x0901  # DEVANAGARI VOWEL-MODIFIER CHANDRABINDU 
     80xA2    0x0902  # DEVANAGARI VOWEL-MODIFIER ANUSWAR 
     90xA3    0x0903  # DEVANAGARI VOWEL-MODIFIER VISARG 
    1610 
    17 0xA4    0x0905  # DEVANAGARI VOWEL A 
    18 0xA5    0x0906  # DEVANAGARI VOWEL AA 
    19 0xA6    0x0907  # DEVANAGARI VOWEL I 
    20 0xA7    0x0908  # DEVANAGARI VOWEL II 
    21 0xA8    0x0909  # DEVANAGARI VOWEL U 
    22 0xA9    0x090A  # DEVANAGARI VOWEL UU 
    23 0xAA    0x090B  # DEVANAGARI VOWEL RI 
    24 0xAB    0x090E  # DEVANAGARI VOWEL E (SOUTHERN SCRIPTS) 
    25 0xAC    0x090F  # DEVANAGARI VOWEL EY 
    26 0xAD    0x0910  # DEVANAGARI VOWEL AI 
    27 0xAE    0x090D  # DEVANAGARI VOWEL AYE (DEVANAGARI SCRIPT) 
    28 0xAF    0x0912  # DEVANAGARI VOWEL O (SOUTHERN SCRIPTS) 
     110xA4    0x0905  # DEVANAGARI VOWEL A 
     120xA5    0x0906  # DEVANAGARI VOWEL AA 
     130xA6    0x0907  # DEVANAGARI VOWEL I 
     140xA7    0x0908  # DEVANAGARI VOWEL II 
     150xA8    0x0909  # DEVANAGARI VOWEL U 
     160xA9    0x090A  # DEVANAGARI VOWEL UU 
     170xAA    0x090B  # DEVANAGARI VOWEL RI 
     180xAB    0x090E  # DEVANAGARI VOWEL E (SOUTHERN SCRIPTS) 
     190xAC    0x090F  # DEVANAGARI VOWEL EY 
     200xAD    0x0910  # DEVANAGARI VOWEL AI 
     210xAE    0x090D  # DEVANAGARI VOWEL AYE (DEVANAGARI SCRIPT) 
     220xAF    0x0912  # DEVANAGARI VOWEL O (SOUTHERN SCRIPTS) 
    2923 
    30 0xB0    0x0913  # DEVANAGARI VOWEL OW 
    31 0xB1    0x0914  # DEVANAGARI VOWEL AU 
    32 0xB2    0x0911  # DEVANAGARI VOWEL AWE  (DEVANAGARI SCRIPT) 
    33 0xB3    0x0915  # DEVANAGARI CONSONANT KA 
    34 0xB4    0x0916  # DEVANAGARI CONSONANT KHA 
    35 0xB5    0x0917  # DEVANAGARI CONSONANT GA 
    36 0xB6    0x0918  # DEVANAGARI CONSONANT GHA 
    37 0xB7    0x0919  # DEVANAGARI CONSONANT NGA 
    38 0xB8    0x091A  # DEVANAGARI CONSONANT CHA 
    39 0xB9    0x091B  # DEVANAGARI CONSONANT CHHA 
    40 0xBA    0x091C  # DEVANAGARI CONSONANT JA 
    41 0xBB    0x091D  # DEVANAGARI CONSONANT JHA 
    42 0xBC    0x091E  # DEVANAGARI CONSONANT JNA 
    43 0xBD    0x091F  # DEVANAGARI CONSONANT HARD TA 
    44 0xBE    0x0920  # DEVANAGARI CONSONANT HARD THA 
    45 0xBF    0x0921  # DEVANAGARI CONSONANT HARD DA 
     240xB0    0x0913  # DEVANAGARI VOWEL OW 
     250xB1    0x0914  # DEVANAGARI VOWEL AU 
     260xB2    0x0911  # DEVANAGARI VOWEL AWE  (DEVANAGARI SCRIPT) 
     270xB3    0x0915  # DEVANAGARI CONSONANT KA 
     280xB4    0x0916  # DEVANAGARI CONSONANT KHA 
     290xB5    0x0917  # DEVANAGARI CONSONANT GA 
     300xB6    0x0918  # DEVANAGARI CONSONANT GHA 
     310xB7    0x0919  # DEVANAGARI CONSONANT NGA 
     320xB8    0x091A  # DEVANAGARI CONSONANT CHA 
     330xB9    0x091B  # DEVANAGARI CONSONANT CHHA 
     340xBA    0x091C  # DEVANAGARI CONSONANT JA 
     350xBB    0x091D  # DEVANAGARI CONSONANT JHA 
     360xBC    0x091E  # DEVANAGARI CONSONANT JNA 
     370xBD    0x091F  # DEVANAGARI CONSONANT HARD TA 
     380xBE    0x0920  # DEVANAGARI CONSONANT HARD THA 
     390xBF    0x0921  # DEVANAGARI CONSONANT HARD DA 
    4640 
    47 0xC0    0x0922  # DEVANAGARI CONSONANT HARD DHA 
    48 0xC1    0x0923  # DEVANAGARI CONSONANT HARD NA 
    49 0xC2    0x0924  # DEVANAGARI CONSONANT SOFT TA 
    50 0xC3    0x0925  # DEVANAGARI CONSONANT SOFT THA 
    51 0xC4    0x0926  # DEVANAGARI CONSONANT SOFT DA 
    52 0xC5    0x0927  # DEVANAGARI CONSONANT SOFT DHA 
    53 0xC6    0x0928  # DEVANAGARI CONSONANT SOFT NA 
    54 0xC7    0x0929  # DEVANAGARI CONSONANT NA (TAMIL) 
    55 0xC8    0x092A  # DEVANAGARI CONSONANT PA 
    56 0xC9    0x092B  # DEVANAGARI CONSONANT PHA 
    57 0xCA    0x092C  # DEVANAGARI CONSONANT BA 
    58 0xCB    0x092D  # DEVANAGARI CONSONANT BHA 
    59 0xCC    0x092E  # DEVANAGARI CONSONANT MA 
    60 0xCD    0x092F  # DEVANAGARI CONSONANT YA 
     410xC0    0x0922  # DEVANAGARI CONSONANT HARD DHA 
     420xC1    0x0923  # DEVANAGARI CONSONANT HARD NA 
     430xC2    0x0924  # DEVANAGARI CONSONANT SOFT TA 
     440xC3    0x0925  # DEVANAGARI CONSONANT SOFT THA 
     450xC4    0x0926  # DEVANAGARI CONSONANT SOFT DA 
     460xC5    0x0927  # DEVANAGARI CONSONANT SOFT DHA 
     470xC6    0x0928  # DEVANAGARI CONSONANT SOFT NA 
     480xC7    0x0929  # DEVANAGARI CONSONANT NA (TAMIL) 
     490xC8    0x092A  # DEVANAGARI CONSONANT PA 
     500xC9    0x092B  # DEVANAGARI CONSONANT PHA 
     510xCA    0x092C  # DEVANAGARI CONSONANT BA 
     520xCB    0x092D  # DEVANAGARI CONSONANT BHA 
     530xCC    0x092E  # DEVANAGARI CONSONANT MA 
     540xCD    0x092F  # DEVANAGARI CONSONANT YA 
    6155# WARNING: THIS CHARACTER IS NON-CANONICAL 
    62 0xCE    0x095F  # DEVANAGARI CONSONANT JKA (BENGALI, ASSAMESE & ORIYA) 
    63 0xCF    0x0930  # DEVANAGARI CONSONANT RA 
     560xCE    0x095F  # DEVANAGARI CONSONANT JKA (BENGALI, ASSAMESE & ORIYA) 
     570xCF    0x0930  # DEVANAGARI CONSONANT RA 
    6458 
    65 0xD0    0x0931  # DEVANAGARI CONSONANT HARD RA (SOUTHERN SCRIPTS) 
    66 0xD1    0x0932  # DEVANAGARI CONSONANT LA 
    67 0xD2    0x0933  # DEVANAGARI CONSONANT HARD LA 
    68 0xD3    0x0934  # DEVANAGARI CONSONANT ZHA (TAMIL & MALAYALAM) 
    69 0xD4    0x0935  # DEVANAGARI CONSONANT VA 
    70 0xD5    0x0936  # DEVANAGARI CONSONANT SHA 
    71 0xD6    0x0937  # DEVANAGARI CONSONANT HARD SHA 
    72 0xD7    0x0938  # DEVANAGARI CONSONANT SA 
    73 0xD8    0x0939  # DEVANAGARI CONSONANT HA 
    74 #0xD9    0x0900  # DEVANAGARI INVISIBLE (NO UNICODE EQUALIVENT) 
    75 0xDA    0x093E  # DEVANAGARI VOWEL SIGN AA 
    76 0xDB    0x093F  # DEVANAGARI VOWEL SIGN I 
    77 0xDC    0x0940  # DEVANAGARI VOWEL SIGN II 
    78 0xDD    0x0941  # DEVANAGARI VOWEL SIGN U 
    79 0xDE    0x0942  # DEVANAGARI VOWEL SIGN UU 
    80 0xDF    0x0943  # DEVANAGARI VOWEL SIGN RI 
     590xD0    0x0931  # DEVANAGARI CONSONANT HARD RA (SOUTHERN SCRIPTS) 
     600xD1    0x0932  # DEVANAGARI CONSONANT LA 
     610xD2    0x0933  # DEVANAGARI CONSONANT HARD LA 
     620xD3    0x0934  # DEVANAGARI CONSONANT ZHA (TAMIL & MALAYALAM) 
     630xD4    0x0935  # DEVANAGARI CONSONANT VA 
     640xD5    0x0936  # DEVANAGARI CONSONANT SHA 
     650xD6    0x0937  # DEVANAGARI CONSONANT HARD SHA 
     660xD7    0x0938  # DEVANAGARI CONSONANT SA 
     670xD8    0x0939  # DEVANAGARI CONSONANT HA 
     68#0xD9   0x0900  # DEVANAGARI INVISIBLE (NO UNICODE EQUIVALENT) 
     690xDA    0x093E  # DEVANAGARI VOWEL SIGN AA 
     700xDB    0x093F  # DEVANAGARI VOWEL SIGN I 
     710xDC    0x0940  # DEVANAGARI VOWEL SIGN II 
     720xDD    0x0941  # DEVANAGARI VOWEL SIGN U 
     730xDE    0x0942  # DEVANAGARI VOWEL SIGN UU 
     740xDF    0x0943  # DEVANAGARI VOWEL SIGN RI 
    8175 
    82 0xE0    0x0946  # DEVANAGARI VOWEL SIGN E (SOUTHERN SCRIPTS) 
    83 0xE1    0x0947  # DEVANAGARI VOWEL SIGN EY 
    84 0xE2    0x0948  # DEVANAGARI VOWEL SIGN AI 
    85 0xE3    0x0945  # DEVANAGARI VOWEL SIGN AYE (DEVANAGARI SCRIPT) 
    86 0xE4    0x094A  # DEVANAGARI VOWEL SIGN O SOUTHERN SCRIPTS) 
    87 0xE5    0x094B  # DEVANAGARI VOWEL SIGN OW 
    88 0xE6    0x094C  # DEVANAGARI VOWEL SIGN AU 
    89 0xE7    0x0949  # DEVANAGARI VOWEL SIGN AWE (DEVANAGARI SCRIPT) 
    90 0xE8    0x094D  # DEVANAGARI VOWEL SIGN OMISSION SIGN (HALANT) 
     760xE0    0x0946  # DEVANAGARI VOWEL SIGN E (SOUTHERN SCRIPTS) 
     770xE1    0x0947  # DEVANAGARI VOWEL SIGN EY 
     780xE2    0x0948  # DEVANAGARI VOWEL SIGN AI 
     790xE3    0x0945  # DEVANAGARI VOWEL SIGN AYE (DEVANAGARI SCRIPT) 
     800xE4    0x094A  # DEVANAGARI VOWEL SIGN O (SOUTHERN SCRIPTS) 
     810xE5    0x094B  # DEVANAGARI VOWEL SIGN OW 
     820xE6    0x094C  # DEVANAGARI VOWEL SIGN AU 
     830xE7    0x0949  # DEVANAGARI VOWEL SIGN AWE (DEVANAGARI SCRIPT) 
     840xE8    0x094D  # DEVANAGARI VOWEL SIGN OMISSION SIGN (HALANT) 
    9185 
    9286#PUNCTUATION 
    93 0xE9    0x093C  # DEVANAGARI DIACRITIC SIGN (NUKTA) 
    94 0xEA    0x0964  # DEVANAGARI FULL STOP 
     870xE9    0x093C  # DEVANAGARI DIACRITIC SIGN (NUKTA) 
     880xEA    0x0964  # DEVANAGARI FULL STOP 
    9589 
    9690#DIGITS 
    97 0xF1    0x0966  # DEVANAGARI DIGIT ZERO 
    98 0xF2    0x0967  # DEVANAGARI DIGIT ONE 
    99 0xF3    0x0968  # DEVANAGARI DIGIT TWO 
    100 0xF4    0x0969  # DEVANAGARI DIGIT THREE 
    101 0xF5    0x096A  # DEVANAGARI DIGIT FOUR 
    102 0xF6    0x096B  # DEVANAGARI DIGIT FIVE 
    103 0xF7    0x096C  # DEVANAGARI DIGIT SIX 
    104 0xF8    0x096D  # DEVANAGARI DIGIT SEVEN 
    105 0xF9    0x096E  # DEVANAGARI DIGIT EIGHT 
    106 0xFA    0x096F  # DEVANAGARI DIGIT NINE 
     910xF1    0x0966  # DEVANAGARI DIGIT ZERO 
     920xF2    0x0967  # DEVANAGARI DIGIT ONE 
     930xF3    0x0968  # DEVANAGARI DIGIT TWO 
     940xF4    0x0969  # DEVANAGARI DIGIT THREE 
     950xF5    0x096A  # DEVANAGARI DIGIT FOUR 
     960xF6    0x096B  # DEVANAGARI DIGIT FIVE 
     970xF7    0x096C  # DEVANAGARI DIGIT SIX 
     980xF8    0x096D  # DEVANAGARI DIGIT SEVEN 
     990xF9    0x096E  # DEVANAGARI DIGIT EIGHT 
     1000xFA    0x096F  # DEVANAGARI DIGIT NINE