Changeset 1227 for trunk/gsdl/perllib
- Timestamp:
- 2000-06-21T15:58:49+12:00 (24 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
- 3 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/multiread.pm
r1224 r1227 26 26 # encodings currently supported are 27 27 # 28 # utf8 - either utf8 or unicode (automatically detected) 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # latin1 - extended ascii (iso-8859-1) 32 # arabic - 8 bit arabic (iso-8859-6) 28 # utf8 - either utf8 or unicode (automatically detected) 29 # unicode - just unicode (doesn't currently do endian detection) 30 # gb - GB 31 # iso_8859_1 - extended ascii (iso-8859-1) 32 # iso_8859_6 - 8 bit arabic (iso-8859-6) 33 # windows_1256 - Windows codepage 1256 (Arabic) 33 34 34 35 package multiread; … … 169 170 } 170 171 171 if ($self->{'encoding'} eq " latin1") {172 if ($self->{'encoding'} eq "iso_8859_1") { 172 173 # Latin 1 extended ascii (ISO-8859-1) 173 174 return undef if (eof ($handle)); … … 175 176 } 176 177 177 if ($self->{'encoding'} eq " arabic") {178 if ($self->{'encoding'} eq "iso_8859_6") { 178 179 # 8 bit Arabic (IOS-8859-6) 179 180 return undef if (eof ($handle)); 180 181 return &unicode::unicode2utf8(&unicode::arabic2unicode (getc ($handle))); 182 } 183 184 if ($self->{'encoding'} eq "windows_1256") { 185 # Windows 1256 (Arabic) 186 return undef if (eof ($handle)); 187 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", getc ($handle))); 181 188 } 182 189 … … 242 249 } 243 250 244 if ($self->{'encoding'} eq " latin1") {251 if ($self->{'encoding'} eq "iso_8859_1") { 245 252 # extended ascii (ISO-8859-1) 246 253 my $line = ""; … … 251 258 } 252 259 253 if ($self->{'encoding'} eq " arabic") {260 if ($self->{'encoding'} eq "iso_8859_6") { 254 261 # 8 bit arabic (ISO-8859-6) 255 262 my $line = ""; … … 259 266 return undef; 260 267 } 268 269 if ($self->{'encoding'} eq "windows_1256") { 270 # Windows 1256 (Arabic) 271 my $line = ""; 272 if (defined ($line = <$handle>)) { 273 return &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $line)); 274 } 275 return undef; 276 } 261 277 262 278 # unknown encoding … … 266 282 267 283 # will convert 
entire contents of file to utf8 and append result to $outputref 284 # this may be a slightly faster way to get the contents of a file than by 285 # recursively calling read_line() 268 286 sub read_file { 269 287 my $self = shift (@_); … … 298 316 if ($self->{'encoding'} eq "gb") { 299 317 undef $/; 300 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode (<$handle>)); 301 $/ = "\n"; 302 return; 303 } 304 305 if ($self->{'encoding'} eq "latin1") { 306 undef $/; 307 $$outputref .= &unicode::ascii2utf8 (<$handle>); 308 $/ = "\n"; 309 return; 310 } 311 312 if ($self->{'encoding'} eq "arabic") { 313 undef $/; 314 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode (<$handle>)); 315 $/ = "\n"; 318 my $text = <$handle>; 319 $/ = "\n"; 320 $$outputref .= &unicode::unicode2utf8 (&gb::gb2unicode ($text)); 321 return; 322 } 323 324 if ($self->{'encoding'} eq "iso_8859_1") { 325 undef $/; 326 my $text = <$handle>; 327 $/ = "\n"; 328 $$outputref .= &unicode::ascii2utf8 ($text); 329 return; 330 } 331 332 if ($self->{'encoding'} eq "iso_8859_6") { 333 my $text = <$handle>; 334 undef $/; 335 $/ = "\n"; 336 $$outputref .= &unicode::unicode2utf8(&unicode::arabic2unicode ($text)); 337 return; 338 } 339 340 if ($self->{'encoding'} eq "windows_1256") { 341 undef $/; 342 my $text = <$handle>; 343 $/ = "\n"; 344 $$outputref .= &unicode::unicode2utf8(&unicode::windows2unicode ("1256", $text)); 316 345 return; 317 346 } -
trunk/gsdl/perllib/plugins/BasPlug.pm
r1219 r1227 47 47 print STDERR " gb (GB or GBK simplified Chinese)\n"; 48 48 print STDERR " iso_8859_6 (8 bit Arabic)\n"; 49 print STDERR " Arabic (the same as iso-8859-6)\n"; 49 print STDERR " windows_1256 (Windows codepage 1256 (Arabic))\n"; 50 print STDERR " Arabic (the same as windows_1256)\n"; 50 51 print STDERR " utf8 (either utf8 or unicode -- automatically detected)\n"; 51 52 print STDERR " unicode (just unicode -- doesn't currently do endian\n"; … … 57 58 58 59 my $self = {}; 59 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6| Arabic|utf8|unicode)\$";60 my $encodings = "^(iso_8859_1|Latin1|ascii|gb|iso_8859_6|windows_1256|Arabic|utf8|unicode)\$"; 60 61 61 62 # general options available to all plugins … … 109 110 if ($self->{'input_encoding'} =~ /^(Latin1|iso_8859_1)$/) { 110 111 $encoding = "latin1"; 111 } elsif ($self->{'input_encoding'} =~ /^(Arabic| iso_8859_6)$/) {112 $encoding = " arabic";112 } elsif ($self->{'input_encoding'} =~ /^(Arabic|windows_1256)$/) { 113 $encoding = "windows_1256"; 113 114 } else { 114 115 $encoding = $self->{'input_encoding'}; -
trunk/gsdl/perllib/unicode.pm
r1223 r1227 31 31 package unicode; 32 32 33 33 %translations = (); 34 34 35 35 # ascii2unicode takes an (extended) ascii string (ISO-8859-1) … … 66 66 return $out; 67 67 } 68 69 # windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic)) 70 # and returns a unicode array. These encodings are similar to but not 71 # identical to the corresponding ISO-8859 encodings. 72 # 73 # The map files for these encodings should be in unicode/MAPPINGS/WINDOWS 74 sub windows2unicode { 75 my ($encoding, $in) = @_; 76 my $out = []; 77 78 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS", 79 "WINDOWS", "$encoding.TXT"); 80 return $out unless &loadmapping ($encoding, $mapfile); 81 82 my $i = 0; 83 my $len = length($in); 84 while ($i < $len) { 85 my $c = ord(substr ($in, $i, 1)); 86 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80); 87 push (@$out, $c); 88 $i++; 89 } 90 91 return $out; 92 } 93 68 94 69 95 # ascii2utf8 takes a (extended) ascii string and … … 210 236 } 211 237 238 # loadmapping expects the mapfile to contain (at least) two 239 # tab-separated fields. The first field is the mapped value 240 # and the second field is the unicode value. 
241 # 242 # It returns 1 if successful, 0 if unsuccessful 243 sub loadmapping { 244 my ($encoding, $mapfile) = @_; 245 246 my $to = "$encoding-unicode"; 247 my $from = "unicode-$encoding"; 248 249 # check to see if the encoding has already been loaded 250 if (defined $translations{$to} && defined $translations{$from}) { 251 return 1; 252 } 253 254 if (!open (MAPFILE, $mapfile)) { 255 print STDERR "ERROR: unable to load mapfile $mapfile\n"; 256 return 0; 257 } 258 259 my ($line, @line); 260 $translations{$to} = {}; 261 $translations{$from} = {}; 262 while (defined ($line = <MAPFILE>)) { 263 # remove comments 264 $line =~ s/\#.*$//; 265 next unless $line =~ /\S/; 266 267 # split the line into fields and do a few 268 # simple sanity checks 269 @line = split (/\t/, $line); 270 next unless (scalar(@line) >= 2 && 271 $line[0] =~ /^0x/ && 272 $line[1] =~ /^0x/); 273 274 my $a = hex($line[0]); 275 my $b = hex($line[1]); 276 277 $translations{$to}->{$a} = $b; 278 $translations{$from}->{$b} = $a; 279 } 280 281 close (MAPFILE); 282 283 return 1; 284 } 285 212 286 213 287 1; 214
Note: See TracChangeset for help on using the changeset viewer.