Changeset 32089

2017-12-08T19:16:12+13:00 (5 years ago)
  1. Attempted fix by Kathy and me for Diego's problem of PDFBox's handling of a PDF. When it was set to convert_to_html, it built fine, but convert_to_text produced something that was invalid XML in doc.XML and build failed. Diego reasoned correctly that building ought to succeed in both cases if it succeeded in one case. Kathy found the correct fix for escaping the ampersand character (it wasn't &amp; to &amp;amp; that I'd attempted, nor did using HTML::Entities' encode work either). 2. The fix needed to read and write files, so introducing readUTF8File() and writeUTF8File() into FileUtils for reusability. Need to still contact John Thompson to ask him if and how these functions need to be modified to support parallel processing, for which FileUtils was written.
2 edited


  • gs2-extensions/pdf-box/trunk/java/perllib/plugins/

    r27510 r32089  
    3232no strict 'subs'; # allow barewords (eg STDERR) as function arguments
     34#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
    3436use gsprintf 'gsprintf';
     37use FileUtils;
    3639# these two variables mustn't be initialised here or they will get stuck
    257260    #print STDERR "**** item file: $target_file_path\n";
    258261    }
     262    elsif ($self->{'converted_to'} eq "text") {
     263    # ensure html entities are doubly escaped for pdfbox to text conversion: &amp; -> &amp;amp;
     264    # conversion to html does it automatically, but conversion to text doesn't
     265    # and this results in illegal characters in doc.xml
     267    my $fulltext = &FileUtils::readUTF8File($target_file_path);
     268    #$fulltext = &HTML::Entities::encode($fulltext); # doesn't seem to help
     269    $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
     270    &FileUtils::writeUTF8File($target_file_path, \$fulltext);
     271    }
    260273    if ($had_error) {
    261274    return (0, $result,$target_file_path);
  • main/trunk/greenstone2/perllib/

    r31724 r32089  
    6060# util::soft_link()            => FileUtils::softLink()
     62# Functions that have been added, but not by John Thompson,
     63# So the implementations don't support parallel processing yet, but they print a warning and the
     64# correct implementation can be put into here. So that if all calls for reading and writing UTF8
     65# file content go through here, then they will do the right thing when the functions are updated.
     67#  => FileUtils::readUTF8File()
     68#  => FileUtils::writeUTF8File()
    6271# Other functions in this file (perhaps some of these may have counterparts in util.pm too):
    962971## readDirectory()
     973## @function readUTF8File()
     975# read contents from a file containing UTF8.
     977# Parameter filename, the filepath to read from
     979sub readUTF8File
     981    my $filename = shift(@_);
     983    print STDERR "@@@ Warning FileUtils::readFile() not yet implemented for parallel processing. Using regular version...\n";
     985    open(FIN,"<$filename") or die "FileUtils::readFile: Unable to open $filename for reading...ERROR: $!\n";
     987    # decode the bytes in the file with UTF8 enc,
     988    # to get unicode aware strings that represent utf8 chars
     989    binmode(FIN,":utf8");
     991    my $contents;
     992    # Read in the entire contents of the file in one hit
     993    sysread(FIN, $contents, -s FIN);
     994    close(FIN);
     995    return $contents;   
     997## readUTF8File()
     999## @function writeUTF8File()
     1001# write UTF8 contents to a file.
     1003# Parameter filename, the filepath to write to
     1004# Parameter contentRef, a *reference* to the contents to write out
     1006sub writeUTF8File
     1008    my ($filename, $contentRef) = @_;
     1010    print STDERR "@@@ Warning FileUtils::writeFile() not yet implemented for parallel processing. Using regular version...\n";
     1012    open(FOUT, ">$filename") or die "FileUtils::writeFile: Unable to open $filename for writing out contents...ERROR: $!\n";
     1013    # encode the unicode aware characters in the string as utf8
     1014    # before writing out the resulting bytes
     1015    binmode(FOUT,":utf8");
     1017    print FOUT $$contentRef;
     1018    close(FOUT);
     1020## writeUTF8File()
    9641022## @function removeFiles()
Note: See TracChangeset for help on using the changeset viewer.