Changeset 35164
- Timestamp: 2021-05-17T09:46:29+12:00 (3 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm
r32778  r35164
   443     443  # print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
   444     444  my $modified_page_contents = $self->_process_pretty_html_page($pagefile, $page_num, $num_html_pages);
           445  # check for surrogates which are illegal in utf8 - need converting to proper codepoint
           446  if ($modified_page_contents =~ /[\x{D800}-\x{DFFF}]/) {
           447  $modified_page_contents =~ s/([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])/desurrogate($1, $2)/ge;
           448  }
   445     449  print OUTFILE "$modified_page_contents\n\n";
   446     450  }
     …       …
   461     465  # of each page.
   462     466  # HTMLPlugin will process these further in the plugin pipeline
           467  }
           468
           469  # Turn a surrogate pair into a character
           470  sub desurrogate
           471  {
           472  my ($hi, $lo) = @_;
           473  my $codepoint = 0x10000 + (ord($hi) - 0xD800) * 0x400 + (ord($lo) - 0xDC00);
           474  return chr($codepoint);
   463     475  }
   464     476
Note: See TracChangeset for help on using the changeset viewer.