Ignore:
Timestamp:
2018-06-21T21:41:12+12:00 (6 years ago)
Author:
ak19
Message:

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there, so I'm optimistically committing the changes. 1. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit, built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdftools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and we need to do some further HTML style, attribute and structure modifications to massage the xpdftools output into what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but I need to check with the others about that.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r30724 r32205  
    323323
    324324    # Attempt conversion to HTML
    325     if (!$output_type || ($output_type =~ m/html/i)) {
     325    # Uses the old pdftohtml that doesn't work for newer PDF versions
     326    #if ($output_type =~ m/^html/i) {
     327    if (!$output_type || ($output_type =~ m/^html/i)) {
    326328    $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
    327329    if ($success) {
    328330        return "html";
     331    }
     332    }
     333
     334    # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This
     335    # will be the new default for PDFs when output_type for PDF docs is not specified
     336    # (once our use of xpdftools' pdftohtml has been implemented on win and mac).
     337    if ($output_type =~ m/paged_html/i) {
     338    #if (!$output_type || ($output_type =~ m/paged_html/i)) {
     339    $success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
     340    if ($success) {
     341        return "paged_html";
    329342    }
    330343    }
     
    756769
    757770
    758 # Convert a pdf file to html with the pdftohtml command
    759 
     771# Convert a pdf file to html with the old pdftohtml command
     772# which only works for older PDF versions
    760773sub pdf_to_html {
    761774    my ($dirname, $input_filename, $output_filestem) = @_;
     
    819832    return 1;
    820833}
     834
     835
     836# Convert a pdf file to html with the newer Xpdftools' pdftohtml
     837# This generates "paged HTML" where extracted, selectable text is positioned
     838# over screenshots of each page.
     839# Since xpdf's pdftohtml fails if the output dir already exists and for easier
     840# naming, the output files are created in a "pages" subdirectory of the tmp
     841# location parent of $output_filestem instead
     842sub xpdf_to_html {
     843    my ($dirname, $input_filename, $output_filestem) = @_;
     844
     845    my $cmd = "";
     846
     847    # build up the path to the doc-to-html conversion tool we're going to use
     848    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
     849
     850    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
     851    # TODO
     852    } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
     853    # TODO
     854    } else { # unix, use the appropriate bin folder for the bitness of the system
     855
     856    # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
     857    # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
     858    # specific subdirectories exist in a greenstone installation.
     859    # None of those locations need exist when xpdf-tools is installed with GS.
     860    # So don't depend on GSDLARCH as forcing that to be exported has side-effects
     861    if($ENV{'BITNESS'}) {
     862        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
     863    } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
     864        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
     865    }
     866    }
     867
     868    # We'll create the file by name $output_filestem during post-conversion processing.
     869    # Note that Xpdf tools will only create its conversion products in a dir that does
     870    # not yet exist. So we'll create this location as a subdir of the output_filestem's
     871    # parent directory. The parent dir is the already generated tmp area for conversion. So:
     872    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
     873    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
     874    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
     875    my ($tailname, $tmp_dirname, $suffix)
     876    = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
     877    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
     878
     879    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");
     880    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
     881    $cmd .= "\"$xpdf_pdftohtml\"";
     882    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
     883#    $cmd .= " -c" if ($pdf_complex);
     884#    $cmd .= " -i" if ($pdf_ignore_images);
     885#    $cmd .= " -a" if ($pdf_allow_images_only);
     886#    $cmd .= " -hidden" unless ($pdf_nohidden);   
     887    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";
     888    #$cmd .= " \"$input_filename\" \"$output_filestem\"";
     889
     890    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
     891    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
     892    } else {
     893    $cmd .= " > \"$output_filestem.err\"";
     894    }
     895
     896    #print STDERR "@@@@ Running command: $cmd\n";
     897
     898    $!=0;
     899    my $retval=system($cmd);
     900    if ($retval!=0)
     901    {
     902    print STDERR "Error executing xpdf's pdftohtml tool";
     903    if ($!) {print STDERR ": $!";}
     904    print STDERR "\n";
     905    }
     906
     907    # make sure the converter made something
     908    if ($retval!=0 || ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
     909    {
     910    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
     911    # print out the converter's std err, if any
     912    if (-s "$output_filestem.err") {
     913        open (ERRLOG, "$output_filestem.err") || die "$!";
     914        print STDERR "pdftohtml error log:\n";
     915        while (<ERRLOG>) {
     916        print STDERR "$_";
     917        }
     918        close ERRLOG;
     919    }
     920    #print STDERR "***********output filestem $output_filestem.html\n";
     921    &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
     922    if (-e "$output_filestem.err") {
     923        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     924        {
     925        open (ERRLOG, "$output_filestem.err");
     926        while (<ERRLOG>) {print FAILLOG $_;}
     927        close ERRLOG;
     928        close FAILLOG;
     929        }   
     930        &FileUtils::removeFiles("$output_filestem.err");
     931    }
     932    return 0;
     933    }
     934
     935    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
     936    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
     937    return 1;
     938}
     939
     940
    821941
    822942# Convert a pdf file to various types of image with the convert command
Note: See TracChangeset for help on using the changeset viewer.