#!/usr/bin/perl # -*-Perl-*- #--- # $Id: shape_arabic.pl 13536 2006-12-20 20:47:58Z mdewsnip $ # # ------------ # Description: # ------------ # Perl script that does arabic utf-8 shaping # # - Note how easy shaping really is. # - This should be used an a learning tool for those interested # in Arabic and in perl (its a learning medium). # # ----------------- # Revision Details: (Updated by Revision Control System) # ----------------- # $Date: 2006-12-20 20:47:58 +0000 (Wed, 20 Dec 2006) $ # $Author: mdewsnip $ # $Revision: 13536 $ # $Source$ # # (www.arabeyes.org - under GPL license) #--- # Point to and use Perl version 5.8 (or greater) use 5.8.0; # Try to be clean in variable usage, etc use strict; # Enable full UTF-8 support use utf8; use open ':utf8'; binmode(STDOUT, ":utf8"); # Enable Automatic command-line parsing require "newgetopt.pl"; my ( $this_script, $opt_debug, $opt_file, $out_line, ); # Specify global variable values and init some Variables $this_script = $0; $this_script =~ s|^.*/([^/]*)|$1|; # Process the command line &get_args(); if (! $opt_file ) { print "ERROR($this_script): Specify input file to process with '-file' option\n"; &usage(1); } open (INFILE, "< $opt_file") or die "Can't open $opt_file - $! \n"; while () { $out_line .= &shape_line($_); } print $out_line; exit(0); ### ### ######## Procedures ######## ### ### ## # Do the actual Arabic Shaping sub shape_line { my ($line) = @_; use constant ISOLATED => 0; use constant INITIAL => 1; use constant MEDIAL => 2; use constant FINAL => 3; my ( @char, $cur, $prev, $next, $index, $prev_index, $next_index, $out, $str, ); # Hex values noted for all entries. # iso-8859-6 => [ s, i, m, f] my %map = ( "621" => ["FE80", 0, 0, 0], # HAMZA "622" => ["FE81", 0, 0, "FE82"], # ALEF_MADDA "623" => ["FE83", 0, 0, "FE84"], # ALEF_HAMZA_ABOVE "624" => ["FE85", 0, 0, "FE86"], # WAW_HAMZA "625" => ["FE87", 0, 0, "FE88"], # ALEF_HAMZA_BELOW "626" => ["FE89", "FE8B", "FE8C", "FE8A"], # YEH_HAMZA "627" => ["FE8D", 0, 0, "FE8E"], # ALEF "628" => ["FE8F", "FE91", "FE92", "FE90"], # BEH "629" => ["FE93", 0, 0, "FE94"], # TEH_MARBUTA "62A" => ["FE95", "FE97", "FE98", "FE96"], # TEH "62B" => ["FE99", "FE9B", "FE9C", "FE9A"], # THEH "62C" => ["FE9D", "FE9F", "FEA0", "FE9E"], # JEEM "62D" => ["FEA1", "FEA3", "FEA4", "FEA2"], # HAH "62E" => ["FEA5", "FEA7", "FEA8", "FEA6"], # KHAH "62F" => ["FEA9", 0, 0, "FEAA"], # DAL "630" => ["FEAB", 0, 0, "FEAC"], # THAL "631" => ["FEAD", 0, 0, "FEAE"], # REH "632" => ["FEAF", 0, 0, "FEB0"], # ZAIN "633" => ["FEB1", "FEB3", "FEB4", "FEB2"], # SEEN "634" => ["FEB5", "FEB7", "FEB8", "FEB6"], # SHEEN "635" => ["FEB9", "FEBB", "FEBC", "FEBA"], # SAD "636" => ["FEBD", "FEBF", "FEC0", "FEBE"], # DAD "637" => ["FEC1", "FEC3", "FEC4", "FEC2"], # TAH "638" => ["FEC5", "FEC7", "FEC8", "FEC6"], # ZAH "639" => ["FEC9", "FECB", "FECC", "FECA"], # AIN "63A" => ["FECD", "FECF", "FED0", "FECE"], # GHAIN "640" => [ "640", 0, 0, 0], # TATWEEL "641" => ["FED1", "FED3", "FED4", "FED2"], # FEH "642" => ["FED5", "FED7", "FED8", "FED6"], # QAF "643" => ["FED9", "FEDB", "FEDC", "FEDA"], # KAF "644" => ["FEDD", "FEDF", "FEE0", "FEDE"], # LAM "645" => ["FEE1", "FEE3", "FEE4", "FEE2"], # MEEM "646" => ["FEE5", "FEE7", "FEE8", "FEE6"], # NOON "647" => ["FEE9", "FEEB", "FEEC", "FEEA"], # HEH "648" => ["FEED", 0, 0, "FEEE"], # WAW "649" => ["FEEF", 0, 0, "FEF0"], # ALEF_MAKSURA "64A" => ["FEF1", "FEF3", "FEF4", "FEF2"], # YEH # exceptions "644622" => ["FEF5", 0, 0, "FEF6"], # LAM_ALEF_MADDA "644623" => ["FEF7", 0, 0, "FEF8"], # LAM_ALEF_HAMZA_ABOVE "644625" => ["FEF9", 0, 0, "FEFA"], # LAM_ALEF_HAMZA_BELOW "644627" => ["FEFB", 0, 0, "FEFC"], # LAM_ALEF ## ---- My additions for Farsi ---- # I think there is a bug in this script which causes an initial "Peh" # to be treated as isolated. This is why the isolated value for "Peh" # is set to the initial value. "67E" => [ "FB58", "FB58", "FB59", "FB57" ], # FARSI PEH "686" => [ "FB7A", "FB7C", "FB7D", "FB7B" ], # FARSI TCHEH "6AF" => [ "FB92", "FB94", "FB95", "FB93" ], # FARSI GAF "6CC" => [ "FBFC", "FBFE", "FBFF", "FBFD" ], # FARSI YEH ); my %special_is_next = ( "640" => 1 ); my %special_is_comb1 = ( "644" => 1 ); my %special_is_comb2 = ( "622" => 1, "623" => 1, "625" => 1, "627" => 1, ); my %special_is_composing = ( "64B" => 1, # FATHATAN "64C" => 1, # DAMMATAN "64D" => 1, # KASRATAN "64E" => 1, # FATHA "64F" => 1, # DAMMA "650" => 1, # KASRA "651" => 1, # SHADDA "652" => 1, # SUKUN "653" => 1, # MADDAH ABOVE "654" => 1, # HAMZA ABOVE "655" => 1, # HAMZA BELOW "670" => 1, # SUPERSCRIPT ALEF "6D6" => 1, # HIGH LIG. SAD WITH LAM WITH ALEF MAKSURA "6D7" => 1, # HIGH LIG. QAF WITH LAM WITH ALEF MAKSURA "6D8" => 1, # HIGH MEEM INITIAL FORM "6D9" => 1, # HIGH LAM ALEF "6DA" => 1, # HIGH JEEM "6DB" => 1, # HIGH THREE DOTS "6DC" => 1, # HIGH SEEN # The 2 entires below should not be here - contact unicode.org !! # "6DD" => 1, # END OF AYAH # "6DE" => 1, # START OF RUB EL HIZB "6DF" => 1, # HIGH ROUNDED ZERO "6E0" => 1, # HIGH UPRIGHT RECTANGULAR ZERO "6E1" => 1, # HIGH DOTLESS HEAD OF KHAH "6E2" => 1, # HIGH MEEM ISOLATED FORM "6E3" => 1, # LOW SEEN "6E4" => 1, # HIGH MADDA "6E7" => 1, # HIGH YEH "6E8" => 1, # HIGH NOON "6EA" => 1, # EMPTY CENTRE LOW STOP "6EB" => 1, # EMPTY CENTRE HIGH STOP "6EC" => 1, # HIGH STOP WITH FILLED CENTRE "6ED" => 1, # LOW MEEM ); @char = split ("", $line); for ($index = 0; $index <= $#char; $index++) { $cur = unpack("U", $char[$index]); $cur = sprintf "\U%x\E", $cur; if ( $opt_debug ) { print "---- $index ----\n"; print "NOW - $cur \n"; } if ( defined $map{$cur} ) { # Previous character status $prev_index = $index; PREV_CHECK: { $prev_index--; $prev = unpack("U", $char[$prev_index]); $prev = sprintf "\U%x\E", $prev; if ( $opt_debug ) { print "prev - $prev \n"; } # Get rid of all previous composers if ( $special_is_composing{$prev} ) { redo PREV_CHECK; } } if ( ($index == 0) || !defined $map{$prev} || (!$map{$prev}[INITIAL] && !$map{$prev}[MEDIAL] ) ) { # Don't have a 'prev' if ( $opt_debug ) { print "prev not defined \n"; } undef $prev; } # Next character status $next_index = $index; NEXT_CHECK: { $next_index++; $next = unpack("U", $char[$next_index]); $next = sprintf "\U%x\E", $next; if ( $opt_debug ) { print "next - $next \n"; } # Get rid of all next composers if ( $special_is_composing{$next} ) { redo NEXT_CHECK; } } if ( ($index == $#char) || !defined $map{$next} || (!$map{$next}[MEDIAL] && !$map{$next}[FINAL] && !$special_is_next{$next} ) ) { # Don't have a 'next' if ( $opt_debug ) { print "next not defined \n"; } undef $next; } # Shape the special combinational characters if ( $special_is_comb1{$cur} ) { if (defined $next ) { if ( $special_is_comb2{$next} ) { $out = ( (defined $prev) ? $map{"$cur$next"}[FINAL] : $map{"$cur$next"}[ISOLATED] ); $str .= pack("U", hex($out)); if ( $opt_debug ) { print "Got me a complex - use $out ($str)\n"; } $index++; next; } } } # Medial if ( (defined $prev) && (defined $next) ) { # In case there is no medial, move to next :-) if ( $out = $map{$cur}[MEDIAL] ) { $str .= pack("U", hex($out)); if ( $opt_debug ) { print "Got prev & next - use $out ($str)\n"; } next; } } # Final if ( (defined $prev) ) { if ( $out = $map{$cur}[FINAL] ) { $str .= pack("U", hex($out)); if ( $opt_debug ) { print "Got prev - use $out ($str)\n"; } next; } } # Initial if ( (defined $next) ) { if ( $out = $map{$cur}[INITIAL] ) { $str .= pack("U", hex($out)); if ( $opt_debug ) { print "Got next - use $out ($str)\n"; } next; } } # Isolated/Seperated (Stand-alone) $out = $map{$cur}[ISOLATED]; $str .= pack("U", hex($out)); if ( $opt_debug ) { print "Got nothing - use $out ($str)\n"; } } else { $str .= pack("U", hex($cur)); } } if ( $opt_debug ) { print "return - $str \n"; } return ($str); } ## # Print the simple usage sub usage { my ($die_after) = @_; print "Usage: $this_script <-file filename> [-debug] \n"; if ( $die_after ) { exit(5); } } ## # Get the command line arguments sub get_args { &NGetOpt ( "debug" => \$opt_debug, # enable the printing of debug output "file=s" => \$opt_file, # specify a filename to process ) || ( $? = 257, die "Invalid argument\n" ); }