Changeset 1868 for trunk/gsdl/perllib/unicode.pm
- Timestamp:
- 2001-01-26T17:25:49+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/unicode.pm
# Changeset view r1844 -> r1868, shown without the viewer's line numbers.
# "[...]" marks lines the viewer elided; excerpts from subs that are not
# fully shown are kept as comments.

# [...]
# }

# iscii2unicode converts the string $in to unicode values in the same
# way iso2unicode does; the only difference is that the map files live
# in unicode/MAPPINGS/ISCII.
#
# $encoding may only be 'Devanagari' at present.
#
# Returns a reference to an array with one entry per character of $in.
# Characters below 0xA0 are passed through unchanged; all others are
# replaced by their entry in $translations{"$encoding-unicode"} (undef
# when there is no such entry).  A reference to an empty array is
# returned if loadmapping fails.
sub iscii2unicode {
    my ($encoding, $in) = @_;
    my $out = [];

    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
                                      "ISCII", "$encoding.txt");
    return $out unless &loadmapping ($encoding, $mapfile);

    foreach my $char (split (//, $in)) {
        my $c = ord ($char);
        if ($c >= 0xA0) {
            $c = $translations{"$encoding-unicode"}->{$c};
        }
        push (@$out, $c);
    }

    return $out;
}

# ascii2utf8 takes a (extended) ascii string and
# [...]

# [...]  (excerpt from a sub that is not fully shown; the
#         "next unless defined" line is new in r1868)
#     foreach $num (@$in) {
#         next unless defined $num;
#         if ($num < 0x80) {
#             $out .= chr ($num);
# [...]


####################################################################################################


# %translations is of the form:
#
# translations{encodingname-encodingname}->blocktranslation
# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
#
# Any of the top translation blocks can point to an undefined
# value. This data structure aims to allow fast translation and
# efficient storage.
%translations = ();

# @array256 is used for initialisation, there must be
# a better way...
@array256 = (0) x 256;    # template of 256 zeros, copied for every table level

# $encodings lists the single-byte encodings simple2unicode supports.
# For each encoding name:
#   fullname    - human readable description
#   mapfile     - file name of the binary map (see simple2unicode for
#                 the directory it is looked up in)
#   ascii_delim - characters below this value are passed through
#                 unchanged; all others are translated through the map
$encodings = {
    'iso_8859_1' => {'fullname' => 'Latin1 (western languages)',
                     'mapfile' => '8859_1.ump', 'ascii_delim' => 0xA0},

    'iso_8859_2' => {'fullname' => 'Latin2 (central and eastern european languages)',
                     'mapfile' => '8859_2.ump', 'ascii_delim' => 0xA0},

    'iso_8859_3' => {'fullname' => 'Latin3',
                     'mapfile' => '8859_3.ump', 'ascii_delim' => 0xA0},

    'iso_8859_4' => {'fullname' => 'Latin4',
                     'mapfile' => '8859_4.ump', 'ascii_delim' => 0xA0},

    'iso_8859_5' => {'fullname' => 'Cyrillic',
                     'mapfile' => '8859_5.ump', 'ascii_delim' => 0xA0},

    'iso_8859_6' => {'fullname' => 'Arabic',
                     'mapfile' => '8859_6.ump', 'ascii_delim' => 0xA0},

    'iso_8859_7' => {'fullname' => 'Greek',
                     'mapfile' => '8859_7.ump', 'ascii_delim' => 0xA0},

    'iso_8859_8' => {'fullname' => 'Hebrew',
                     'mapfile' => '8859_8.ump', 'ascii_delim' => 0xA0},

    'iso_8859_9' => {'fullname' => 'Latin5',
                     'mapfile' => '8859_9.ump', 'ascii_delim' => 0xA0},

    'windows_1250' => {'fullname' => 'Windows codepage 1250 (WinLatin2)',
                       'mapfile' => 'win1250.ump', 'ascii_delim' => 0x80},

    'windows_1251' => {'fullname' => 'Windows codepage 1251 (WinCyrillic)',
                       'mapfile' => 'win1251.ump', 'ascii_delim' => 0x80},

    'windows_1252' => {'fullname' => 'Windows codepage 1252 (WinLatin1)',
                       'mapfile' => 'win1252.ump', 'ascii_delim' => 0x80},

    'windows_1253' => {'fullname' => 'Windows codepage 1253 (WinGreek)',
                       'mapfile' => 'win1253.ump', 'ascii_delim' => 0x80},

    'windows_1254' => {'fullname' => 'Windows codepage 1254 (WinTurkish)',
                       'mapfile' => 'win1254.ump', 'ascii_delim' => 0x80},

    'windows_1255' => {'fullname' => 'Windows codepage 1255 (WinHebrew)',
                       'mapfile' => 'win1255.ump', 'ascii_delim' => 0x80},

    'windows_1256' => {'fullname' => 'Windows codepage 1256 (WinArabic)',
                       'mapfile' => 'win1256.ump', 'ascii_delim' => 0x80},

    'windows_1257' => {'fullname' => 'Windows codepage 1257 (WinBaltic)',
                       'mapfile' => 'win1257.ump', 'ascii_delim' => 0x80},

    'windows_1258' => {'fullname' => 'Windows codepage 1258 (Vietnamese)',
                       'mapfile' => 'win1258.ump', 'ascii_delim' => 0x80},

    'windows_874' => {'fullname' => 'Windows codepage 874 (Thai)',
                      'mapfile' => 'win874.ump', 'ascii_delim' => 0x80},

    'koi8_r' => {'fullname' => 'Cyrillic',
                 'mapfile' => 'koi8_r.ump', 'ascii_delim' => 0x80},

    'koi8_u' => {'fullname' => 'Cyrillic (Ukrainian)',
                 'mapfile' => 'koi8_u.ump', 'ascii_delim' => 0x80},

    'iscii_de' => {'fullname' => 'ISCII Devanagari',
                   'mapfile' => 'iscii_de.ump', 'ascii_delim' => 0xA0}
};

# simple2unicode converts $intext, a string in the encoding named by
# $encoding (which must be a key of $encodings), to unicode values.
#
# returns a pointer to unicode array (one value per input character).
# If the encoding is not supported or its map file cannot be loaded an
# error is printed to STDERR and a pointer to an empty array is returned.
sub simple2unicode {
    my ($encoding, $intext) = @_;

    if (!defined ($encodings->{$encoding})) {
        print STDERR "unicode::simple2unicode: ERROR: $encoding encoding not supported\n";
        return [];
    }

    my $info = $encodings->{$encoding};
    my $encodename = "$encoding-unicode";
    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings", "to_uc",
                                      $info->{'mapfile'});

    if (!&loadmapencoding ($encodename, $mapfile)) {
        print STDERR "unicode: ERROR - could not load encoding $encodename\n";
        return [];
    }

    my @outtext = ();
    foreach my $char (split (//, $intext)) {
        my $c = ord ($char);
        if ($c < $info->{'ascii_delim'}) {
            # normal ascii character
            push (@outtext, $c);
        } else {
            push (@outtext, &transchar ($encodename, $c));
        }
    }
    return \@outtext;
}

# loadmapencoding reads the binary map file $mapfile into
# $translations{$encoding} unless that entry is already defined.
#
# The file is read as a sequence of records: one byte giving the block
# number, followed by up to 256 two-byte big-endian values for that
# block.  Entries that are not present in the file stay 0.
#
# returns 1 if successful, 0 if unsuccessful (the file could not be
# opened)
sub loadmapencoding {
    my ($encoding, $mapfile) = @_;

    # check to see if the encoding has already been loaded
    return 1 if (defined $translations{$encoding});

    # three-argument open: $mapfile is only ever treated as a file name
    open (my $maphandle, '<', $mapfile) or return 0;
    binmode ($maphandle);

    my $block = [@array256];
    $translations{$encoding} = $block;

    my ($in, $i, $j);
    while (read ($maphandle, $in, 1) == 1) {
        $i = unpack ("C", $in);
        $block->[$i] = [@array256];
        for ($j = 0; $j < 256 && read ($maphandle, $in, 2) == 2; $j++) {
            # "n" is an unsigned 16 bit big-endian value: high*256 + low
            $block->[$i]->[$j] = unpack ("n", $in);
        }
    }

    close ($maphandle);

    # report success explicitly instead of returning close()'s status
    return 1;
}

# transchar looks up the value $from in the translation table that was
# loaded for $encoding.
#
# Returns the translated value, or 0 if the encoding has not been
# loaded or the table has no block for the high byte of $from.
sub transchar {
    my ($encoding, $from) = @_;

    my $block = $translations{$encoding};
    return 0 unless defined $block;

    my $high = ($from / 256) % 256;
    my $low = $from % 256;

    # top-level slots that were never filled still hold the 0 placeholder
    return 0 if (ref ($block->[$high]) ne "ARRAY");

    return $block->[$high]->[$low];
}




1;
Note:
See TracChangeset
for help on using the changeset viewer.