Changeset 1868
- Timestamp:
- 2001-01-26T17:25:49+13:00 (23 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 100 added
- 35 deleted
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/etc/main.cfg
r1856 r1868 84 84 # longname -- The display name of the given encoding. If longname isn't set 85 85 # it will default to using shortname instead. 86 87 86 # type -- The type of encoding. Note that for most encodings this 88 87 # value is the directory name under which the map file for 89 88 # this encoding resides in the Greenstone unicode/MAPPINGS 90 89 # directory (e.g. 'WINDOWS', 'ISO_8859' etc.). It may also 91 # take the values ' GB' and 'UTF8'.90 # take the values 'CJK' and 'UTF8'. 92 91 # mapfile -- The name of the map file for use when converting between 93 92 # utf8 and the given encoding. The mapfile option is mandatory 94 # for all encoding types with the exception of GB and UTF8. 93 # for all encoding types with the exception of UTF8. If type 94 # is CJK, mapfile is the abbreviated name of the encoding as 95 # used by the binary mapping files (.ump files). i.e. if the 96 # encoding uses the map files gbku.ump and ugbk.ump, mapfile 97 # will be set to "gbk". 95 98 # label -- The standard label to which you must set the value of 96 99 # "charset" within http headers or html meta tags to get a web … … 101 104 Encoding shortname=w1251 "longname=Cyrillic (Windows-1251)" type=WINDOWS mapfile=1251.TXT label=windows-1251 102 105 Encoding shortname=w1256 "longname=Arabic (Windows-1256)" type=WINDOWS mapfile=1256.TXT label=windows-1256 103 Encoding shortname=gb "longname=Simplified Chinese (GBK)" type=GB label=GBK 106 Encoding shortname=w1256 "longname=Central European (Windows-1250)" type=WINDOWS mapfile=1250.TXT label=windows-1250 107 Encoding shortname=gb "longname=Chinese Simplified (GBK)" type=CJK label=GBK mapfile=gbk 108 Encoding shortname=sjis "longname=Japanese (Shift-JIS)" type=CJK label=shift_jis mapfile=sjis 104 109 Encoding shortname=koi8r "longname=Cyrillic (KOI8-R)" type=CYRILLIC mapfile=koi8_r.txt label=koi8-r 110 111 # The following encoding is not currently supported 112 # Encoding shortname=eucjp "longname=Japanese (EUC)" type=CJK label=euc-jp mapfile=jis 105 113 106 114 -
trunk/gsdl/perllib/doc.pm
r1844 r1868 130 130 } 131 131 132 sub set_source_encoding {133 my $self = shift (@_);134 my ($source_encoding) = @_;135 136 $self->set_metadata_element ($self->get_top_section(),137 "gsdlsourceencoding",138 $source_encoding);139 }140 141 # returns the source_encoding as it was provided142 sub get_source_encoding {143 my $self = shift (@_);144 145 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourceencoding");146 }147 148 132 sub _escape_text { 149 133 my ($text) = @_; -
trunk/gsdl/perllib/multiread.pm
r1844 r1868 30 30 # gb - GB 31 31 # iso_8859_[1-9] - 8 bit extended ascii encodings 32 # windows_125[0-6] - Windows codepages 1250 to 1256 32 # windows_125[0-8] - Windows codepages 1250 to 1258 33 # windows 874 - Windows codepage 874 34 # iscii_de - ISCII Devanagari 35 # shift_jis - Shift-JIS 36 # euc_jp - EUC encoded Japanese 37 # uhc - Unified Hangul Code (Korean) 33 38 34 39 package multiread; 35 40 36 41 use unicode; 37 use gb;42 use cjk; 38 43 39 44 sub new { … … 73 78 # if automatic detection between utf8 and unicode is desired 74 79 # then the encoding should be initially set to utf8 75 sub read_ char {80 sub read_unicode_char { 76 81 my $self = shift (@_); 77 82 … … 79 84 return undef if ($self->{'handle'} eq ""); 80 85 my $handle = $self->{'handle'}; 86 binmode ($handle); 81 87 82 88 if ($self->{'encoding'} eq "utf8") { … … 99 105 $self->{'encoding'} = "unicode"; 100 106 $self->{'bigendian'} = 0; 101 if ($ENV{'GSDLOS'} =~ /windows/i) {102 binmode ($handle); # silly windows103 }104 107 last; 105 108 … … 107 110 $self->{'encoding'} = "unicode"; 108 111 $self->{'bigendian'} = 1; 109 if ($ENV{'GSDLOS'} =~ /windows/i) {110 binmode ($handle); # silly windows111 }112 112 last; 113 113 } … … 153 153 } 154 154 155 if ($self->{'encoding'} eq "gb") {156 # GB or GBK157 return undef if (eof ($handle));158 my $c1 = getc ($handle);159 if (ord ($c1) >= 0x81) {160 # double byte character161 return undef if (eof ($handle));162 my $c2 = getc ($handle);163 return &unicode::unicode2utf8 (&gb::gb2unicode ($c1.$c2));164 165 } else {166 # single byte character167 return &unicode::ascii2utf8 ($c1);168 }169 }170 171 if ($self->{'encoding'} eq "iso_8859_1") {172 # special case for iso_8859_1 as &ascii2utf8($char) is faster than173 # &unicode2utf8(iso2unicode('1', $char))174 return undef if (eof ($handle));175 return &unicode::ascii2utf8 (getc ($handle));176 }177 178 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) {179 return undef if (eof ($handle));180 return &unicode::unicode2utf8(&unicode::iso2unicode ($1, getc($handle)));181 }182 183 if ($self->{'encoding'} =~ /windows_(\d{4})$/) {184 return undef if (eof ($handle));185 return &unicode::unicode2utf8(&unicode::windows2unicode ($1, getc($handle)));186 }187 188 if ($self->{'encoding'} =~ /^koi8_[ru]$/) {189 return undef if (eof ($handle));190 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, getc($handle)));191 }192 193 # unknown encoding194 155 return undef; 195 156 } … … 211 172 my $out = ""; 212 173 my $thisc = ""; 213 while (defined ($thisc = $self->read_ char())) {174 while (defined ($thisc = $self->read_unicode_char())) { 214 175 $out .= $thisc; 215 176 last if ($thisc eq "\n"); … … 219 180 return undef; 220 181 } 221 222 182 223 183 if ($self->{'encoding'} eq "utf8") { … … 247 207 my $line = ""; 248 208 if (defined ($line = <$handle>)) { 249 return &unicode::unicode2utf8 (& gb::gb2unicode ($line));209 return &unicode::unicode2utf8 (&cjk::gb2unicode ($line)); 250 210 } 251 211 return undef; … … 270 230 } 271 231 272 if ($self->{'encoding'} =~ /windows_(\d{ 4})$/) {232 if ($self->{'encoding'} =~ /windows_(\d{3,4})$/) { 273 233 my $line = ""; 274 234 if (defined ($line = <$handle>)) { … … 282 242 if (defined ($line = <$handle>)) { 283 243 return &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $line)); 244 } 245 return undef; 246 } 247 248 if ($self->{'encoding'} eq "iscii_de") { 249 my $line = ""; 250 if (defined ($line = <$handle>)) { 251 return &unicode::unicode2utf8(&unicode::iscii2unicode ("Devanagari", $line)); 284 252 } 285 253 return undef; … … 328 296 my $text = <$handle>; 329 297 $/ = "\n"; 330 $$outputref .= &unicode::unicode2utf8 (& gb::gb2unicode ($text));298 $$outputref .= &unicode::unicode2utf8 (&cjk::gb2unicode ($text)); 331 299 return; 332 300 } … … 341 309 return; 342 310 } 343 344 if ($self->{'encoding'} =~ /^iso_8859_(\d+)$/) { 345 undef $/; 346 my $text = <$handle>; 347 $/ = "\n"; 348 $$outputref .= &unicode::unicode2utf8(&unicode::iso2unicode ($1, $text)); 349 return; 350 } 351 352 if ($self->{'encoding'} =~ /windows_(\d{4})$/) { 353 undef $/; 354 my $text = <$handle>; 355 $/ = "\n"; 356 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ($1, $text)); 357 return; 358 } 359 360 if ($self->{'encoding'} =~ /^koi8_[ru]$/) { 361 undef $/; 362 my $text = <$handle>; 363 $/ = "\n"; 364 $$outputref .= &unicode::unicode2utf8(&unicode::cyrillic2unicode ($self->{'encoding'}, $text)); 365 return; 366 } 311 312 if ($self->{'encoding'} eq "shift_jis") { 313 undef $/; 314 my $text = <$handle>; 315 $/ = "\n"; 316 $$outputref .= &unicode::unicode2utf8(&cjk::sjis2unicode ($text)); 317 return; 318 } 319 320 if ($self->{'encoding'} eq "euc_jp") { 321 undef $/; 322 my $text = <$handle>; 323 $/ = "\n"; 324 $$outputref .= &unicode::unicode2utf8(&cjk::eucjp2unicode ($text)); 325 return; 326 } 327 328 if ($self->{'encoding'} eq "euc_kr") { 329 undef $/; 330 my $text = <$handle>; 331 $/ = "\n"; 332 $$outputref .= &unicode::unicode2utf8(&cjk::euckr2unicode ($text)); 333 return; 334 } 335 336 if ($self->{'encoding'} eq "uhc") { 337 undef $/; 338 my $text = <$handle>; 339 $/ = "\n"; 340 $$outputref .= &unicode::unicode2utf8(&cjk::uhc2unicode ($text)); 341 return; 342 } 343 344 # if we get to here we assume it's a simple 8 bit encoding 345 undef $/; 346 my $text = <$handle>; 347 $/ = "\n"; 348 $$outputref .= &unicode::unicode2utf8(&unicode::simple2unicode ($self->{'encoding'}, $text)); 367 349 } 368 350 -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1857 r1868 40 40 %supported_encodings = ( 41 41 "ascii" => "", 42 "utf8" => "", 42 43 "iso_8859_1" => "", 43 44 "windows_1252" => "", … … 58 59 "iso_8859_9" => "", 59 60 "windows_1254" => "", 60 "gb" => "" 61 "gb" => "", 62 "iscii_de" => "", 63 "windows_1257" => "", 64 "windows_874" => "", 65 "windows_1258" => "", 66 "shift_jis" => "", 67 "euc_jp" => "", 68 "uhc" => "" 61 69 ); 62 70 … … 128 136 print STDERR " windows_1254: Windows codepage 1254 (WinTurkish)\n"; 129 137 130 print STDERR " gb: GB or GBK simplified Chinese\n\n"; 138 print STDERR " gb: GB or GBK simplified Chinese\n"; 139 140 print STDERR " iscii_de: ISCII Devanagari\n"; 141 142 print STDERR " windows_1257: Windows codepage 1257 (WinBaltic)\n"; 143 144 print STDERR " windows_874: Windows codepage 874 (Thai)\n"; 145 146 print STDERR " windows_1258: Windows codepage 1258 (Vietnamese)\n"; 147 148 print STDERR " shift_jis: Shift-JIS (Japanese)\n"; 149 print STDERR " euc_jp: EUC encoded Japanese\n"; 150 151 print STDERR " uhc: Unified Hangul Code (Korean). This is a superset of\n"; 152 print STDERR " EUC encoded Korean\n\n"; 153 131 154 132 155 print STDERR " -default_encoding If -input_encoding is set to 'auto' and the text categorization\n"; … … 144 167 print STDERR " this value.\n\n"; 145 168 146 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n \n";169 print STDERR " -extract_acronyms Extract acronyms from within text and set as metadata\n"; 147 170 148 171 print STDERR " -markup_acronyms Add acronym metadata into document text\n\n"; … … 153 176 print STDERR " -extract_email Extract email addresses as metadata\n\n"; 154 177 155 print STDERR " -extract_date Extract dates pertaining to the content of documents about history\n\n"; 156 print STDERR " -maximum_date The maximum historical date to be used as metadata (in a Common Era date such as 1950)\n\n"; 157 print STDERR " -maximum_century The maximum named ceuntury to be extracted as historical metadata (e.g. 14 will extract all references up to the 14th century)\n\n"; 158 print STDERR " -no_bibliography Do not try and block pbibliographic dates when extracting historical dates.\n\n"; 178 print STDERR " -extract_date Extract dates pertaining to the content of documents about history\n"; 179 print STDERR " -maximum_date The maximum historical date to be used as metadata (in a Common Era\n"; 180 print STDERR " date such as 1950)\n"; 181 print STDERR " -maximum_century The maximum named century to be extracted as historical metadata\n"; 182 print STDERR " (e.g. 14 will extract all references up to the 14th century)\n"; 183 print STDERR " -no_bibliography Do not try and block bibliographic dates when extracting historical dates.\n\n"; 159 184 } 160 185 … … 163 188 sub print_usage { 164 189 print STDERR "\nThis plugin has no plugin specific options\n\n"; 165 166 190 } 167 191 … … 173 197 my $enc = "^("; 174 198 map {$enc .= "|$_";} keys %supported_encodings; 175 my $denc = $enc . "|u tf8|unicode)\$";176 $enc .= "|u tf8|unicode|auto)\$";199 my $denc = $enc . "|unicode)\$"; 200 $enc .= "|unicode|auto)\$"; 177 201 178 202 $self->{'outhandle'} = STDERR; … … 321 345 my $doc_obj = new doc ($filename, "indexed_doc"); 322 346 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language); 323 $doc_obj->set_source_encoding ($encoding); 324 347 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding); 325 348 326 349 # read in file ($text will be in utf8) … … 426 449 427 450 if (scalar @results != 1) { 451 428 452 if ($self->{'input_encoding'} ne 'auto') { 429 453 if ($self->{'extract_language'} && $self->{'verbosity'}) { … … 444 468 # format language/encoding 445 469 my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/; 446 $language = $iso639::toiso639{lc($language)};447 470 die "Invalid language\n" if !defined $language; 448 471 … … 450 473 # if textcat returned no encoding info it is assumed to be iso_8859_1 451 474 $encoding = "iso_8859_1"; 452 } else {453 # convert to the format we expect454 $encoding =~ s/windows/windows_/;455 $encoding =~ s/iso8859/iso_8859/;456 $encoding =~ s/^gb.*$/gb/;457 475 } 458 476 -
trunk/gsdl/perllib/unicode.pm
r1844 r1868 133 133 } 134 134 135 # iscii2unicode is basically identical to iso2unicode, the only 136 # difference being that the map files live in unicode/MAPPINGS/ISCII 137 # 138 # values for $encoding may be 'Devanagari' only at present 139 sub iscii2unicode { 140 my ($encoding, $in) = @_; 141 my $out = []; 142 143 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 144 "ISCII", "$encoding.txt"); 145 return $out unless &loadmapping ($encoding, $mapfile); 146 147 my $i = 0; 148 my $len = length($in); 149 while ($i < $len) { 150 my $c = ord(substr ($in, $i, 1)); 151 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0); 152 push (@$out, $c); 153 $i++; 154 } 155 156 return $out; 157 } 135 158 136 159 # ascii2utf8 takes a (extended) ascii string and … … 169 192 170 193 foreach $num (@$in) { 194 next unless defined $num; 171 195 if ($num < 0x80) { 172 196 $out .= chr ($num); … … 326 350 327 351 352 353 354 355 356 357 358 359 #################################################################################################### 360 361 362 # %translations is of the form: 363 # 364 # encodings{encodingname-encodingname}->blocktranslation 365 # blocktranslation->[[0-255],[256-511], ..., [65280-65535]] 366 # 367 # Any of the top translation blocks can point to an undefined 368 # value. This data structure aims to allow fast translation and 369 # efficient storage. 370 %translations = (); 371 372 # @array256 is used for initialisation, there must be 373 # a better way... 374 @array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 375 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 376 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 377 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 378 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 379 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 380 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 381 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 382 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 383 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 384 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 385 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 386 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 387 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 388 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 389 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); 390 391 $encodings = { 392 'iso_8859_1' => {'fullname' => 'Latin1 (western languages)', 393 'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0}, 394 395 'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)', 396 'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0}, 397 398 'iso_8859_3' => {'fullname' => 'Latin3', 399 'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0}, 400 401 'iso_8859_4' => {'fullname' => 'Latin4', 402 'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0}, 403 404 'iso_8859_5' => {'fullname' => 'Cyrillic', 405 'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0}, 406 407 'iso_8859_6' => {'fullname' => 'Arabic', 408 'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0}, 409 410 'iso_8859_7' => {'fullname' => 'Greek', 411 'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0}, 412 413 'iso_8859_8' => {'fullname' => 'Hebrew', 414 'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0}, 415 416 'iso_8859_9' => {'fullname' => 'Latin5', 417 'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0}, 418 419 'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)', 420 'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80}, 421 422 'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)', 423 'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80}, 424 425 'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)', 426 'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80}, 427 428 'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)', 429 'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80}, 430 431 'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)', 432 'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80}, 433 434 'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)', 435 'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80}, 436 437 'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)', 438 'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80}, 439 440 'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)', 441 'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80}, 442 443 'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)', 444 'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80}, 445 446 'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)', 447 'mapfile' => 'win874.ump', 'ascii_delim' => 0x80}, 448 449 'koi8_r' => {'fullname' => 'Cyrillic', 450 'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80}, 451 452 'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)', 453 'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80}, 454 455 'iscii_de' => {'fullname' => 'ISCII Devanagari', 456 'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0} 457 }; 458 459 # returns a pointer to unicode array 460 sub simple2unicode { 461 my ($encoding, $intext) = @_; 462 463 if (!defined ($encodings->{$encoding})) { 464 print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n"; 465 return []; 466 } 467 468 my $info = $encodings->{$encoding}; 469 my $encodename = "$encoding-unicode"; 470 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc", 471 $info->{'mapfile'}); 472 473 if (!&loadmapencoding ($encodename, $mapfile)) { 474 print STDERR "unicode: ERROR - could not load encoding $encodename\n"; 475 return []; 476 } 477 478 my @outtext = (); 479 my $len = length($intext); 480 my ($c); 481 my $i = 0; 482 483 while ($i < $len) { 484 if (($c = ord(substr($intext, $i, 1))) < $info->{'ascii_delim'}) { 485 # normal ascii character 486 push (@outtext, $c); 487 } else { 488 push (@outtext, &transchar ($encodename, $c)); 489 } 490 $i ++; 491 } 492 return \@outtext; 493 } 494 495 # returns 1 if successful, 0 if unsuccessful 496 sub loadmapencoding { 497 my ($encoding, $mapfile) = @_; 498 499 # check to see if the encoding has already been loaded 500 return 1 if (defined $translations{$encoding}); 501 502 return 0 unless open (MAPFILE, $mapfile); 503 binmode (MAPFILE); 504 505 $translations{$encoding} = [@array256]; 506 my $block = $translations{$encoding}; 507 508 my ($in,$i,$j); 509 while (read(MAPFILE, $in, 1) == 1) { 510 $i = unpack ("C", $in); 511 $block->[$i] = [@array256]; 512 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) { 513 my ($n1, $n2) = unpack ("CC", $in); 514 $block->[$i]->[$j] = ($n1*256) + $n2; 515 } 516 } 517 518 close (MAPFILE); 519 } 520 521 sub transchar { 522 my ($encoding, $from) = @_; 523 my $high = ($from / 256) % 256; 524 my $low = $from % 256; 525 526 return 0 unless defined $translations{$encoding}; 527 528 my $block = $translations{$encoding}; 529 530 if (ref ($block->[$high]) ne "ARRAY") { 531 return 0; 532 } 533 return $block->[$high]->[$low]; 534 } 535 536 537 538 328 539 1; 540 -
trunk/gsdl/unicode/MAPPINGS/ISCII/Devanagari.txt
r1522 r1868 1 1 # ISCII / IS 13194:1991 2 3 # This table was generated by Stuart ([email protected]) for4 # the Greenstone Digital Library software from the ISCII (Indian5 # Script Code for Information Interchange). It maps from the6 # ISCII 7 bit code page covering Latin and Indian Scripts to7 # the Unicode 0900-907F range.8 2 9 3 # see Unicode Standard Version 2.0 pages 7-72 … … 11 5 12 6 #LETTERS 13 0xA1 0x0901# DEVANAGARI VOWEL-MODIFIER CHANDRABINDU14 0xA2 0x0902# DEVANAGARI VOWEL-MODIFIER ANUSWAR15 0xA3 0x0903# DEVANAGARI VOWEL-MODIFIER VISARG7 0xA1 0x0901 # DEVANAGARI VOWEL-MODIFIER CHANDRABINDU 8 0xA2 0x0902 # DEVANAGARI VOWEL-MODIFIER ANUSWAR 9 0xA3 0x0903 # DEVANAGARI VOWEL-MODIFIER VISARG 16 10 17 0xA4 0x0905# DEVANAGARI VOWEL A18 0xA5 0x0906# DEVANAGARI VOWEL AA19 0xA6 0x0907# DEVANAGARI VOWEL I20 0xA7 0x0908# DEVANAGARI VOWEL II21 0xA8 0x0909# DEVANAGARI VOWEL U22 0xA9 0x090A# DEVANAGARI VOWEL UU23 0xAA 0x090B# DEVANAGARI VOWEL RI24 0xAB 0x090E# DEVANAGARI VOWEL E (SOUTHERN SCRIPTS)25 0xAC 0x090F# DEVANAGARI VOWEL EY26 0xAD 0x0910# DEVANAGARI VOWEL AI27 0xAE 0x090D# DEVANAGARI VOWEL AYE (DEVANAGARI SCRIPT)28 0xAF 0x0912# DEVANAGARI VOWEL O (SOUTHERN SCRIPTS)11 0xA4 0x0905 # DEVANAGARI VOWEL A 12 0xA5 0x0906 # DEVANAGARI VOWEL AA 13 0xA6 0x0907 # DEVANAGARI VOWEL I 14 0xA7 0x0908 # DEVANAGARI VOWEL II 15 0xA8 0x0909 # DEVANAGARI VOWEL U 16 0xA9 0x090A # DEVANAGARI VOWEL UU 17 0xAA 0x090B # DEVANAGARI VOWEL RI 18 0xAB 0x090E # DEVANAGARI VOWEL E (SOUTHERN SCRIPTS) 19 0xAC 0x090F # DEVANAGARI VOWEL EY 20 0xAD 0x0910 # DEVANAGARI VOWEL AI 21 0xAE 0x090D # DEVANAGARI VOWEL AYE (DEVANAGARI SCRIPT) 22 0xAF 0x0912 # DEVANAGARI VOWEL O (SOUTHERN SCRIPTS) 29 23 30 0xB0 0x0913# DEVANAGARI VOWEL OW31 0xB1 0x0914# DEVANAGARI VOWEL AU32 0xB2 0x0911 # DEVANAGARI VOWEL AWE(DEVANAGARI SCRIPT)33 0xB3 0x0915# DEVANAGARI CONSONANT KA34 0xB4 0x0916# DEVANAGARI CONSONANT KHA35 0xB5 0x0917# DEVANAGARI CONSONANT GA36 0xB6 0x0918# DEVANAGARI CONSONANT GHA37 0xB7 0x0919# DEVANAGARI CONSONANT NGA38 0xB8 0x091A# DEVANAGARI CONSONANT CHA39 0xB9 0x091B# DEVANAGARI CONSONANT CHHA40 0xBA 0x091C# DEVANAGARI CONSONANT JA41 0xBB 0x091D# DEVANAGARI CONSONANT JHA42 0xBC 0x091E# DEVANAGARI CONSONANT JNA43 0xBD 0x091F# DEVANAGARI CONSONANT HARD TA44 0xBE 0x0920# DEVANAGARI CONSONANT HARD THA45 0xBF 0x0921# DEVANAGARI CONSONANT HARD DA24 0xB0 0x0913 # DEVANAGARI VOWEL OW 25 0xB1 0x0914 # DEVANAGARI VOWEL AU 26 0xB2 0x0911 # DEVANAGARI VOWEL AWE (DEVANAGARI SCRIPT) 27 0xB3 0x0915 # DEVANAGARI CONSONANT KA 28 0xB4 0x0916 # DEVANAGARI CONSONANT KHA 29 0xB5 0x0917 # DEVANAGARI CONSONANT GA 30 0xB6 0x0918 # DEVANAGARI CONSONANT GHA 31 0xB7 0x0919 # DEVANAGARI CONSONANT NGA 32 0xB8 0x091A # DEVANAGARI CONSONANT CHA 33 0xB9 0x091B # DEVANAGARI CONSONANT CHHA 34 0xBA 0x091C # DEVANAGARI CONSONANT JA 35 0xBB 0x091D # DEVANAGARI CONSONANT JHA 36 0xBC 0x091E # DEVANAGARI CONSONANT JNA 37 0xBD 0x091F # DEVANAGARI CONSONANT HARD TA 38 0xBE 0x0920 # DEVANAGARI CONSONANT HARD THA 39 0xBF 0x0921 # DEVANAGARI CONSONANT HARD DA 46 40 47 0xC0 0x0922# DEVANAGARI CONSONANT HARD DHA48 0xC1 0x0923# DEVANAGARI CONSONANT HARD NA49 0xC2 0x0924# DEVANAGARI CONSONANT SOFT TA50 0xC3 0x0925# DEVANAGARI CONSONANT SOFT THA51 0xC4 0x0926# DEVANAGARI CONSONANT SOFT DA52 0xC5 0x0927# DEVANAGARI CONSONANT SOFT DHA53 0xC6 0x0928# DEVANAGARI CONSONANT SOFT NA54 0xC7 0x0929# DEVANAGARI CONSONANT NA (TAMIL)55 0xC8 0x092A# DEVANAGARI CONSONANT PA56 0xC9 0x092B# DEVANAGARI CONSONANT PHA57 0xCA 0x092C# DEVANAGARI CONSONANT BA58 0xCB 0x092D# DEVANAGARI CONSONANT BHA59 0xCC 0x092E# DEVANAGARI CONSONANT MA60 0xCD 0x092F# DEVANAGARI CONSONANT YA41 0xC0 0x0922 # DEVANAGARI CONSONANT HARD DHA 42 0xC1 0x0923 # DEVANAGARI CONSONANT HARD NA 43 0xC2 0x0924 # DEVANAGARI CONSONANT SOFT TA 44 0xC3 0x0925 # DEVANAGARI CONSONANT SOFT THA 45 0xC4 0x0926 # DEVANAGARI CONSONANT SOFT DA 46 0xC5 0x0927 # DEVANAGARI CONSONANT SOFT DHA 47 0xC6 0x0928 # DEVANAGARI CONSONANT SOFT NA 48 0xC7 0x0929 # DEVANAGARI CONSONANT NA (TAMIL) 49 0xC8 0x092A # DEVANAGARI CONSONANT PA 50 0xC9 0x092B # DEVANAGARI CONSONANT PHA 51 0xCA 0x092C # DEVANAGARI CONSONANT BA 52 0xCB 0x092D # DEVANAGARI CONSONANT BHA 53 0xCC 0x092E # DEVANAGARI CONSONANT MA 54 0xCD 0x092F # DEVANAGARI CONSONANT YA 61 55 # WARNING: THIS CHARACTER IS NON-CANNONICAL 62 0xCE 0x095F# DEVANAGARI CONSONANT JKA (BENGALI, ASSAMESE & ORIYA)63 0xCF 0x0930# DEVANAGARI CONSONANT RA56 0xCE 0x095F # DEVANAGARI CONSONANT JKA (BENGALI, ASSAMESE & ORIYA) 57 0xCF 0x0930 # DEVANAGARI CONSONANT RA 64 58 65 0xD0 0x0931# DEVANAGARI CONSONANT HARD RA (SOUTHERN SCRIPTS)66 0xD1 0x0932# DEVANAGARI CONSONANT LA67 0xD2 0x0933# DEVANAGARI CONSONANT HARD LA68 0xD3 0x0934# DEVANAGARI CONSONANT ZHA (TAMIL & MALAYALAM)69 0xD4 0x0935# DEVANAGARI CONSONANT VA70 0xD5 0x0936# DEVANAGARI CONSONANT SHA71 0xD6 0x0937# DEVANAGARI CONSONANT HARD SHA72 0xD7 0x0938# DEVANAGARI CONSONANT SA73 0xD8 0x0939# DEVANAGARI CONSONANT HA74 #0xD9 0x0900# DEVANAGARI INVISIBLE (NO UNICODE EQUALIVENT)75 0xDA 0x093E# DEVANAGARI VOWEL SIGN AA76 0xDB 0x093F# DEVANAGARI VOWEL SIGN I77 0xDC 0x0940# DEVANAGARI VOWEL SIGN II78 0xDD 0x0941# DEVANAGARI VOWEL SIGN U79 0xDE 0x0942# DEVANAGARI VOWEL SIGN UU80 0xDF 0x0943# DEVANAGARI VOWEL SIGN RI59 0xD0 0x0931 # DEVANAGARI CONSONANT HARD RA (SOUTHERN SCRIPTS) 60 0xD1 0x0932 # DEVANAGARI CONSONANT LA 61 0xD2 0x0933 # DEVANAGARI CONSONANT HARD LA 62 0xD3 0x0934 # DEVANAGARI CONSONANT ZHA (TAMIL & MALAYALAM) 63 0xD4 0x0935 # DEVANAGARI CONSONANT VA 64 0xD5 0x0936 # DEVANAGARI CONSONANT SHA 65 0xD6 0x0937 # DEVANAGARI CONSONANT HARD SHA 66 0xD7 0x0938 # DEVANAGARI CONSONANT SA 67 0xD8 0x0939 # DEVANAGARI CONSONANT HA 68 #0xD9 0x0900 # DEVANAGARI INVISIBLE (NO UNICODE EQUALIVENT) 69 0xDA 0x093E # DEVANAGARI VOWEL SIGN AA 70 0xDB 0x093F # DEVANAGARI VOWEL SIGN I 71 0xDC 0x0940 # DEVANAGARI VOWEL SIGN II 72 0xDD 0x0941 # DEVANAGARI VOWEL SIGN U 73 0xDE 0x0942 # DEVANAGARI VOWEL SIGN UU 74 0xDF 0x0943 # DEVANAGARI VOWEL SIGN RI 81 75 82 0xE0 0x0946# DEVANAGARI VOWEL SIGN E (SOUTHERN SCRIPTS)83 0xE1 0x0947# DEVANAGARI VOWEL SIGN EY84 0xE2 0x0948# DEVANAGARI VOWEL SIGN AI85 0xE3 0x0945# DEVANAGARI VOWEL SIGN AYE (DEVANAGARI SCRIPT)86 0xE4 0x094A# DEVANAGARI VOWEL SIGN O SOUTHERN SCRIPTS)87 0xE5 0x094B# DEVANAGARI VOWEL SIGN OW88 0xE6 0x094C# DEVANAGARI VOWEL SIGN AU89 0xE7 0x0949# DEVANAGARI VOWEL SIGN AWE (DEVANAGARI SCRIPT)90 0xE8 0x094D# DEVANAGARI VOWEL SIGN OMISSION SIGN (HALANT)76 0xE0 0x0946 # DEVANAGARI VOWEL SIGN E (SOUTHERN SCRIPTS) 77 0xE1 0x0947 # DEVANAGARI VOWEL SIGN EY 78 0xE2 0x0948 # DEVANAGARI VOWEL SIGN AI 79 0xE3 0x0945 # DEVANAGARI VOWEL SIGN AYE (DEVANAGARI SCRIPT) 80 0xE4 0x094A # DEVANAGARI VOWEL SIGN O SOUTHERN SCRIPTS) 81 0xE5 0x094B # DEVANAGARI VOWEL SIGN OW 82 0xE6 0x094C # DEVANAGARI VOWEL SIGN AU 83 0xE7 0x0949 # DEVANAGARI VOWEL SIGN AWE (DEVANAGARI SCRIPT) 84 0xE8 0x094D # DEVANAGARI VOWEL SIGN OMISSION SIGN (HALANT) 91 85 92 86 #PUNCTUATION 93 0xE9 0x093C# DEVANAGARI DIACRITIC SIGN (NUKTA)94 0xEA 0x0964# DEVANAGARI FULL STOP87 0xE9 0x093C # DEVANAGARI DIACRITIC SIGN (NUKTA) 88 0xEA 0x0964 # DEVANAGARI FULL STOP 95 89 96 90 #DIGITS 97 0xF1 0x0966# DEVANAGARI DIGIT ZERO98 0xF2 0x0967# DEVANAGARI DIGIT ONE99 0xF3 0x0968# DEVANAGARI DIGIT TWO100 0xF4 0x0969# DEVANAGARI DIGIT THREE101 0xF5 0x096A# DEVANAGARI DIGIT FOUR102 0xF6 0x096B# DEVANAGARI DIGIT FIVE103 0xF7 0x096C# DEVANAGARI DIGIT SIX104 0xF8 0x096D# DEVANAGARI DIGIT SEVEN105 0xF9 0x096E# DEVANAGARI DIGIT EIGHT106 0xFA 0x096F# DEVANAGARI DIGIT NINE91 0xF1 0x0966 # DEVANAGARI DIGIT ZERO 92 0xF2 0x0967 # DEVANAGARI DIGIT ONE 93 0xF3 0x0968 # DEVANAGARI DIGIT TWO 94 0xF4 0x0969 # DEVANAGARI DIGIT THREE 95 0xF5 0x096A # DEVANAGARI DIGIT FOUR 96 0xF6 0x096B # DEVANAGARI DIGIT FIVE 97 0xF7 0x096C # DEVANAGARI DIGIT SIX 98 0xF8 0x096D # DEVANAGARI DIGIT SEVEN 99 0xF9 0x096E # DEVANAGARI DIGIT EIGHT 100 0xFA 0x096F # DEVANAGARI DIGIT NINE
Note:
See TracChangeset
for help on using the changeset viewer.