Changeset 35164


Ignore:
Timestamp:
2021-05-17T09:46:29+12:00 (3 years ago)
Author:
kjdon
Message:

xpdf seems to output UTF-16 surrogate pairs into the HTML; these end up being invalid in UTF-8. Detect them and convert each pair to the proper character, which can then be written out as valid UTF-8.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

    r32778 r35164  
    443 443 #       print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
    444 444         my $modified_page_contents = $self->_process_pretty_html_page($pagefile, $page_num, $num_html_pages);
        445         # check for surrogates which are illegal in utf8 - need converting to proper codepoint
        446         if ($modified_page_contents =~ /[\x{D800}-\x{DFFF}]/) {
        447         $modified_page_contents =~ s/([\x{D800}-\x{DBFF}])([\x{DC00}-\x{DFFF}])/desurrogate($1, $2)/ge;
        448         }
    445 449         print OUTFILE "$modified_page_contents\n\n";
    446 450     }
     
    461 465     # of each page.
    462 466     # HTMLPlugin will process these further in the plugin pipeline
        467 }
        468
        469 # Turn a surrogate pair into a character
        470 sub desurrogate
        471 {
        472     my ($hi, $lo) = @_;
        473     my $codepoint = 0x10000 + (ord($hi) - 0xD800) * 0x400 + (ord($lo) - 0xDC00);
        474     return chr($codepoint);
    463 475 }
    464 476
Note: See TracChangeset for help on using the changeset viewer.