Changeset 7798 for trunk/gsdl
- Timestamp:
- 2004-07-22T12:13:45+12:00 (20 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/doc.pm
r7569 r7798 788 788 } 789 789 790 # add_utf8_metadata assumes the text has already been converted791 # to the UTF-8 encoding.792 790 sub add_utf8_metadata { 793 791 my $self = shift (@_); … … 809 807 } 810 808 809 # double check that the value is utf-8 810 if (unicode::ensure_utf8(\$value)) { 811 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n"; 812 } 813 811 814 push (@{$section_ptr->{'metadata'}}, [$field, $value]); 812 815 } -
trunk/gsdl/perllib/unicode.pm
# unicode.pm, r4229 -> r7798 (excerpt; lines not shown in the changeset are
# marked with [...])
#
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2004 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# [...]


# ensure_utf8 makes sure that the referenced input string is utf8 encoded.
#
# Argument:
#   $stringref - reference to the string to check; the referenced string is
#                rewritten in place when it has to be changed.
#
# Every well-formed multi-byte sequence (a lead byte in \xc0-\xfd followed
# by the right number of \x80-\xbf continuation bytes, 2 to 6 bytes in
# total) is left untouched.  Any other byte in the range \x80-\xff is
# "out-of-place" for utf-8; it is assumed to be iso-8859-1 and is replaced
# by whatever unicode::ascii2utf8 returns for it.
#
# Returns 0 if the text was already utf8 (or undefined), or 1 if the text
# was modified to become utf8.
sub ensure_utf8 {
    my $stringref = shift;

    my $value = $$stringref;
    # nothing to check; also avoids "uninitialized" warnings in the match
    return 0 unless defined $value;

    # One well-formed sequence.  Each alternative must be separated by '|':
    # under /x, adjacent alternatives without it are simply concatenated.
    my $utf8_sequence = qr{
        [\xc0-\xdf][\x80-\xbf]    |   # 2 byte utf-8
        [\xe0-\xef][\x80-\xbf]{2} |   # 3 byte
        [\xf0-\xf7][\x80-\xbf]{3} |   # 4 byte
        [\xf8-\xfb][\x80-\xbf]{4} |   # 5 byte
        [\xfc-\xfd][\x80-\xbf]{5}     # 6 byte
    }x;

    my $non_utf8_found = 0;

    # Walk the string once.  At each high byte, first try to consume a run
    # of well-formed sequences ($1); only if that fails is the single byte
    # treated as a bad byte ($2).  Plain ascii never matches and is skipped.
    $value =~ s{ ( (?: $utf8_sequence )+ ) | ( [\x80-\xff] ) }{
        my ($good, $badbyte) = ($1, $2);   # copy: the callee may run regexes
        if (defined $good) {
            $good;
        }
        else {
            $non_utf8_found = 1;
            # replace bad byte. assume iso-8859-1 -> utf-8
            # ascii2utf8 does "extended ascii"... ie iso-8859-1
            &unicode::ascii2utf8(\$badbyte);
        }
    }gex;

    # hand the repaired text back to the caller (only when it changed, so
    # that an already-valid string is never written to)
    $$stringref = $value if $non_utf8_found;

    return $non_utf8_found;
}

1;
Note:
See TracChangeset
for help on using the changeset viewer.