Changeset 1227
- Timestamp:
- 2000-06-21T15:58:49+12:00 (24 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/togb.pl
r1226 r1227 36 36 if (!parsargv::parse(\@ARGV, 37 37 'unicode', \$unicode, 38 ' latin1', \$latin1,38 'iso_8859_1', \$iso_8859_1, 39 39 'gb', \$gb)) { 40 40 print STDERR "\n usage: $0 [options]\n\n"; 41 41 print STDERR " options:\n"; 42 42 print STDERR " -unicode input is in utf-8 or unicode (default)\n"; 43 print STDERR " - latin1 input is in extended ascii (ISO-8859-1)\n";43 print STDERR " -iso_8859_1 input is in extended ascii (ISO-8859-1 Latin 1)\n"; 44 44 print STDERR " -gb input is in GB or GBK (simplified Chinese)\n\n"; 45 45 die "\n"; … … 47 47 48 48 $encoding = "utf8" if $unicode; 49 $encoding = " latin1" if $latin1;49 $encoding = "iso_8859_1" if $iso_8859_1; 50 50 $encoding = "gb" if $gb; 51 51 -
trunk/gsdl/bin/script/touc.pl
r1226 r1227 36 36 if (!parsargv::parse(\@ARGV, 37 37 'unicode', \$unicode, 38 'latin1', \$latin1, 39 'arabic', \$arabic, 38 'iso_8859_1', \$iso_8859_1, 39 'iso_8859_6', \$iso_8859_6, 40 'windows_1256', \$windows_1256, 40 41 'gb', \$gb)) { 41 42 print STDERR "\n usage: $0 [options]\n\n"; 42 43 print STDERR " options:\n"; 43 print STDERR " -unicode input is in utf-8 or unicode (default)\n"; 44 print STDERR " -latin1 input is in extended ascii (ISO-8859-1)\n"; 45 print STDERR " -arabic input is in 8 bit Arabic (ISO-8859-6)\n"; 46 print STDERR " -gb input is in GB or GBK (simplified Chinese)\n\n"; 44 print STDERR " -unicode input is in utf-8 or unicode (default)\n"; 45 print STDERR " -iso_8859_1 input is in extended ascii (ISO-8859-1 Latin 1)\n"; 46 print STDERR " -iso_8859_6 input is in 8 bit Arabic (ISO-8859-6)\n"; 47 print STDERR " -windows_1256 input is in Windows 1256 (Arabic)\n"; 48 print STDERR " -gb input is in GB or GBK (simplified Chinese)\n\n"; 47 49 die "\n"; 48 50 } 49 51 50 52 $encoding = "utf8" if $unicode; 51 $encoding = "latin1" if $latin1; 52 $encoding = "arabic" if $arabic; 53 $encoding = "iso_8859_1" if $iso_8859_1; 54 $encoding = "iso_8859_6" if $iso_8859_6; 55 $encoding = "windows_1256" if $windows_1256; 53 56 $encoding = "gb" if $gb; 54 57 -
trunk/gsdl/bin/script/toutf8.pl
r1226 r1227 36 36 if (!parsargv::parse(\@ARGV, 37 37 'unicode', \$unicode, 38 'latin1', \$latin1, 39 'arabic', \$arabic, 38 'iso_8859_1', \$iso_8859_1, 39 'iso_8859_6', \$iso_8859_6, 40 'windows_1256', \$windows_1256, 40 41 'gb', \$gb)) { 41 42 print STDERR "\n usage: $0 [options]\n\n"; 42 43 print STDERR " options:\n"; 43 print STDERR " -unicode input is in utf-8 or unicode (default)\n"; 44 print STDERR " -latin1 input is in extended ascii (ISO-8859-1)\n"; 45 print STDERR " -arabic input is in 8 bit Arabic (ISO-8859-6)\n"; 46 print STDERR " -gb input is in GB or GBK (simplified Chinese)\n\n"; 44 print STDERR " -unicode input is in utf-8 or unicode (default)\n"; 45 print STDERR " -iso_8859_1 input is in extended ascii (ISO-8859-1 Latin 1)\n"; 46 print STDERR " -iso_8859_6 input is in 8 bit Arabic (ISO-8859-6)\n"; 47 print STDERR " -windows_1256 input is in Windows 1256 (Arabic)\n"; 48 print STDERR " -gb input is in GB or GBK (simplified Chinese)\n\n"; 47 49 die "\n"; 48 50 } 49 51 50 52 $encoding = "utf8" if $unicode; 51 $encoding = "latin1" if $latin1; 52 $encoding = "arabic" if $arabic; 53 $encoding = "iso_8859_1" if $iso_8859_1; 54 $encoding = "iso_8859_6" if $iso_8859_6; 55 $encoding = "windows_1256" if $windows_1256; 53 56 $encoding = "gb" if $gb; 54 57 -
trunk/gsdl/perllib/multiread.pm
r1224 r1227 26 26 # encodings currently supported are 27 27 # 28 # utf8 - either utf8 or unicode (automatically detected) 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # latin1 - extended ascii (iso-8859-1) 32 # arabic - 8 bit arabic (iso-8859-6) 28 # utf8 - either utf8 or unicode (automatically detected) 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # iso_8859_1 - extended ascii (iso-8859-1) 32 # iso_8859_6 - 8 bit arabic (iso-8859-6) 33 # windows_1256 - Windows codepage 1256 (Arabic) 33 34 34 35 package multiread; … … 169 170 } 170 171 171 if ($self->{'encoding'} eq " latin1") {172 if ($self->{'encoding'} eq "iso_8859_1") { 172 173 # Latin 1 extended ascii (ISO-8859-1) 173 174 return undef if (eof ($handle)); … … 175 176 } 176 177 177 if ($self->{'encoding'} eq " arabic") {178 if ($self->{'encoding'} eq "iso_8859_6") { 178 179 # 8 bit Arabic (ISO-8859-6) 179 180 return undef if (eof ($handle)); 180 181 return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle))); 182 } 183 184 if ($self->{'encoding'} eq "windows_1256") { 185 # Windows 1256 (Arabic) 186 return undef if (eof ($handle)); 187 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle))); 181 188 } 182 189 … … 242 249 } 243 250 244 if ($self->{'encoding'} eq " latin1") {251 if ($self->{'encoding'} eq "iso_8859_1") { 245 252 # extended ascii (ISO-8859-1) 246 253 my $line = ""; … … 251 258 } 252 259 253 if ($self->{'encoding'} eq " arabic") {260 if ($self->{'encoding'} eq "iso_8859_6") { 254 261 # 8 bit arabic (ISO-8859-6) 255 262 my $line = ""; … … 259 266 return undef; 260 267 } 268 269 if ($self->{'encoding'} eq "windows_1256") { 270 # Windows 1256 (Arabic) 271 my $line = ""; 272 if (defined ($line = <$handle>)) { 273 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line)); 274 } 275 return undef; 276 } 261 277 262 278 # unknown encoding … … 266 282 267 283 # will convert 
entire contents of file to utf8 and append result to $outputref 284 # this may be a slightly faster way to get the contents of a file than by 285 # recursively calling read_line() 268 286 sub read_file { 269 287 my $self = shift (@_); … … 298 316 if ($self->{'encoding'} eq "gb") { 299 317 undef $/; 300 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode (<$handle>)); 301 $/ = "\n"; 302 return; 303 } 304 305 if ($self->{'encoding'} eq "latin1") { 306 undef $/; 307 $$outputref .= &unicode::ascii2utf8 (<$handle>); 308 $/ = "\n"; 309 return; 310 } 311 312 if ($self->{'encoding'} eq "arabic") { 313 undef $/; 314 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode (<$handle>)); 315 $/ = "\n"; 318 my $text = <$handle>; 319 $/ = "\n"; 320 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text)); 321 return; 322 } 323 324 if ($self->{'encoding'} eq "iso_8859_1") { 325 undef $/; 326 my $text = <$handle>; 327 $/ = "\n"; 328 $$outputref .= &unicode::ascii2utf8 ($text); 329 return; 330 } 331 332 if ($self->{'encoding'} eq "iso_8859_6") { 333 undef $/; 334 my $text = <$handle>; 335 $/ = "\n"; 336 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text)); 337 return; 338 } 339 340 if ($self->{'encoding'} eq "windows_1256") { 341 undef $/; 342 my $text = <$handle>; 343 $/ = "\n"; 344 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text)); 316 345 return; 317 346 } -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1219 r1227 47 47 print STDERR " gb (GB or GBK simplified Chinese)\n"; 48 48 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 49 print STDERR " Arabic (the same as iso-8859-6)\n"; 49 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n"; 50 print STDERR " Arabic (the same as windows_1256)\n"; 50 51 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 51 52 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; … … 57 58 58 59 my $self = {}; 59 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6| Arabic|utf8|unicode)\$";60 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$"; 60 61 61 62 # general options available to all plugins … … 109 110 if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) { 110 111 $encoding = "latin1"; 111 } elsif ($self->{'input_encoding'} =~ /^(Arabic| iso_8859_6)$/) {112 $encoding = " arabic";112 } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) { 113 $encoding = "windows_1256"; 113 114 } else { 114 115 $encoding = $self->{'input_encoding'}; -
trunk/gsdl/perllib/unicode.pm
r1223 r1227 31 31 package unicode; 32 32 33 33 %translations = (); 34 34 35 35 # ascii2unicode takes an (extended) ascii string (ISO-8859-1) … … 66 66 return $out; 67 67 } 68 69 # windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic)) 70 # and returns a unicode array. These encodings are similar to but not 71 # identical to the corresponding ISO-8859 encodings. 72 # 73 # The map files for these encodings should be in unicode/MAPPINGS/WINDOWS 74 sub windows2unicode { 75 my ($encoding, $in) = @_; 76 my $out = []; 77 78 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 79 "WINDOWS", "$encoding.TXT"); 80 return $out unless &loadmapping ($encoding, $mapfile); 81 82 my $i = 0; 83 my $len = length($in); 84 while ($i < $len) { 85 my $c = ord(substr ($in, $i, 1)); 86 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80); 87 push (@$out, $c); 88 $i++; 89 } 90 91 return $out; 92 } 93 68 94 69 95 # ascii2utf8 takes a (extended) ascii string and … … 210 236 } 211 237 238 # loadmapping expects the mapfile to contain (at least) two 239 # tab-separated fields. The first field is the mapped value 240 # and the second field is the unicode value. 
241 # 242 # It returns 1 if successful, 0 if unsuccessful 243 sub loadmapping { 244 my ($encoding, $mapfile) = @_; 245 246 my $to = "$encoding-unicode"; 247 my $from = "unicode-$encoding"; 248 249 # check to see if the encoding has already been loaded 250 if (defined $translations{$to} && defined $translations{$from}) { 251 return 1; 252 } 253 254 if (!open (MAPFILE, $mapfile)) { 255 print STDERR "ERROR: unable to load mapfile $mapfile\n"; 256 return 0; 257 } 258 259 my ($line, @line); 260 $translations{$to} = {}; 261 $translations{$from} = {}; 262 while (defined ($line = <MAPFILE>)) { 263 # remove comments 264 $line =~ s/\#.*$//; 265 next unless $line =~ /\S/; 266 267 # split the line into fields and do a few 268 # simple sanity checks 269 @line = split (/\t/, $line); 270 next unless (scalar(@line) >= 2 && 271 $line[0] =~ /^0x/ && 272 $line[1] =~ /^0x/); 273 274 my $a = hex($line[0]); 275 my $b = hex($line[1]); 276 277 $translations{$to}->{$a} = $b; 278 $translations{$from}->{$b} = $a; 279 } 280 281 close (MAPFILE); 282 283 return 1; 284 } 285 212 286 213 287 1; 214
Note:
See TracChangeset
for help on using the changeset viewer.