Ignore:
Timestamp:
2018-06-21T21:41:12+12:00 (6 years ago)
Author:
ak19
Message:

First set of commits to do with implementing the new 'paged_html' output option of PDFPlugin that uses xpdftools' new pdftohtml. So far tested only on Linux (64 bit), but things work there, so I'm optimistically committing the changes. 1. Committing the pre-built Linux binaries of XPDFtools for both 32 and 64 bit, built by the XPDF group. 2. To use the correct bitness variant of xpdftools, setup.bash now exports the BITNESS env var, consulted by gsConvert.pl. 3. All the perl code changes to do with using xpdftools' pdftohtml to generate paged_html and feed it in the desired form into GS(3): gsConvert.pl, PDFPlugin.pm and its parent ConvertBinaryFile.pm have been modified to make it all work. xpdftools' pdftohtml generates a folder containing an html file and a screenshot for each page in a PDF (as well as an index.html linking to each page's html). However, we want a single html file that contains each individual 'page' html's content in a div, and need to do some further HTML style, attribute and structure modifications to massage the xpdftools output into what we want for GS. In order to parse and manipulate the HTML 'DOM' to do this, we're using the Mojo::DOM package that Dr Bainbridge found and which he's compiled up. Mojo::DOM is therefore also committed in this revision. Some further changes and some display fixes are required, but need to check with the others about that.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r31494 r32205  
    2727use strict;
    2828no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
     29no strict 'subs'; # allow filehandles to be variables and viceversa
    2930
    3031use ReadTextFile;
    3132use unicode;
     33use Mojo::DOM; # for HTML parsing
    3234
    3335use AutoLoadConverters;
     
    4446      { 'name' => "text",
    4547    'desc' => "{ConvertBinaryFile.convert_to.text}" },
     48      { 'name' => "paged_html",
     49    'desc' => "{PDFPlugin.convert_to.paged_html}"},
    4650      { 'name' => "pagedimg_jpg",
    4751    'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
     
    145149
    146150    # check convert_to
     151    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    147152    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
    148153    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
     
    281286    my ($conv_filename) = @_;
    282287
     288    my $outhandle=$self->{'outhandle'};
     289#    print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n";
     290
     291    if($self->{'convert_to'} eq "paged_html") {
     292    # special post-processing for paged_html mode, as HTML pages generated
     293    # by xpdf's pdftohtml need to be massaged into the form we want
     294    $self->xpdftohtml_convert_post_process($conv_filename);
     295    }
     296    else { # use PDFPlugin's usual post processing
     297    $self->default_convert_post_process($conv_filename);
     298    }
     299}
     300
     301# Called after gsConvert.pl has been run to convert a PDF to paged_html
     302# using Xpdftools' pdftohtml
     303# This method will do some cleanup of the HTML files produced after XPDF has produced
     304# an HTML doc for each PDF page: it first gets rid of the default index.html.
     305# Instead, it constructs a single html page containing each original HTML page
     306# <body> nested as divs instead, with simple section information inserted at the top
     307# of each 'page' <div> and some further styling customisation. This HTML manipulation
     308# is to be done with the Mojo::DOM perl package.
     309# Note that since xpdf's pdftohtml would have failed if the output dir already
     310# existed and for simpler naming, the output files are created in a new "pages"
     311# subdirectory of the tmp location parent of $conv_filename instead
     312sub xpdftohtml_convert_post_process
     313{
     314    my $self = shift (@_);
     315    my ($output_filename) = @_; # output_filename = tmp location + filename
     316    # if a single html were generated.
     317    # We just want the tmp location, append "pages", and read all the html files
     318    # in except for index.html. Then we create a new html file by name
     319    # $output_filename, which will consist of a slightly modified version of
     320    # each of the other html files concatenated together.
     321
     322    my $outhandle=$self->{'outhandle'};
     323
     324    my ($tailname, $tmp_dir, $suffix)
     325    = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
     326    my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages");
     327
     328    # Code from util::create_itemfile()
     329    # Read in all the files
     330    opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!";
     331    my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR);
     332    closedir DIR;
     333    # Sort files in the directory by page_num
     334    # files are named index.html, page1.html, page2.html, ..., pagen.html
     335    sub page_number {
     336    my ($dir) = @_;
     337    my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i);
     338    $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
     339    return $pagenum;
     340    }
     341    # sort the files in the directory in the order of page_num rather than lexically.
     342    @page_files = sort { page_number($a) <=> page_number($b) } @page_files;
     343
     344    #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file.
     345              # For every html file there's an img file, so halve the total num.
     346              # What about other file types that may potentially be there too???
     347    my $num_html_pages = 0;
     348    foreach my $pagefile (@page_files) {
     349    $num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;
     350    }
     351
     352    # Prepare to create our new html page that will contain all the individual
     353    # htmls generated by xpdf's pdftohtml in sequence.
     354    # First write the opening html tags out to the output file. These are the
     355    # same tags and their contents, including <meta>, as is generated by
     356    # Xpdf's pdftohtml for each of its individual html pages.
     357    my $start_text = "<html>\n<head>\n";
     358    $start_text .= "<title>$tailname</title>\n";
     359    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
     360    $start_text .= "</head>\n<body>\n\n";
     361
     362    #handle content encodings the same way that default_convert_post_process does
     363    # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write   
     364    # Don't want to build a giant string in memory of all the pages concatenated
     365    # and then write it out in one go. Instead, build up the final single page
     366    # by writing each modified paged_html file out to it as this is processed.
     367    # Copying file open/close code from CommonUtil::utf8_write_file()
     368    if (!open (OUTFILE, ">:utf8", $output_filename)) {
     369    gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename);
     370    die "\n";
     371    }
     372    print OUTFILE $start_text;
     373
     374    # Get the contents of each individual HTML page generated by Xpdf, after first
     375    # modifying each, and write each out into our single all-encompassing html
     376    foreach my $pagefile (@page_files) {
     377    if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) {
     378        my $page_num = page_number($pagefile);   
     379        # get full path to pagefile
     380        $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
     381#       print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
     382        my $modified_page_contents = $self->_process_paged_html_page($pagefile, $page_num, $num_html_pages);
     383        print OUTFILE "$modified_page_contents\n\n";
     384    }
     385    }
     386
     387    # we've now created a single HTML file by concatenating (a modified version)
     388    # of each paged html file
     389    print OUTFILE "</body>\n</html>\n"; # write out closing tags
     390    close OUTFILE; # done
     391
     392    # Get rid of all the htm(l) files incl index.html in the associated "pages"
     393    # subdir, since we've now processed them all into a single html file
     394    # one folder level up and we don't want HTMLPlugin to process all of them next.
     395#    my @fullpath_page_files = map { &FileUtils::filenameConcatenate($pages_subdir, $_) } @page_files;
     396    &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); #  no specific whitelist, but blacklist htm(l)
     397
     398    # now the tmp area should contain a single html file contain all the html pages'
     399    # contents in sequence, and a "pages" subdir containing the screenshot images
     400    # of each page.   
     401    # HTMLPlugin will process these further in the plugin pipeline
     402}
     403
     404# For whatever reason, most html <tags> don't get printed out in GLI
     405# So when debugging, use this function to print them out as [tags] instead.
     406sub _debug_print_html
     407{
     408    my $self = shift (@_);
     409    my ($string_or_dom) = @_;
     410
     411    # can't seem to determine type of string with ref/reftype
     412    # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable
     413
     414    # $dom objects appear to get correctly stringified in string contexts
     415    # $dom.to_string/$dom.stringify seem to get called, no need to call them
     416    # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
     417    my $escapedTxt = $string_or_dom;
     418    $escapedTxt =~ s@\<@[@sg;
     419    $escapedTxt =~ s@\>@]@sg;
     420
     421    print STDERR "#### $escapedTxt\n";
     422}
     423
     424# Helper function to read in each paged_html generated by Xpdf's pdftohtml
     425# then modify the html suitably using the HTML parsing functions offered by
     426# Mojo::DOM, then return the modified HTML content as a string
     427# See https://mojolicious.org/perldoc/Mojo/DOM
     428sub _process_paged_html_page
     429{
     430    my $self = shift (@_);
     431    my ($pagefile, $page_num, $num_html_pages) = @_;
     432
     433    my $text = "";
     434
     435    # handling content encoding the same way default_convert_post_process does
     436    $self->read_file ($pagefile, "utf8", "", \$text);
     437
     438    my $dom = Mojo::DOM->new($text);
     439
     440#    $self->_debug_print_html($dom);
     441
     442    # there's a <style> element on the <html>, we need to shift it into the <div>
     443    # tag that we'll be creating. We'll first slightly modify the <style> element
     444    # store the first style element, which is the only one and in the <body>
     445    # we'll later insert it as child of an all-encompassing div that we'll create
     446#    my $page_style_tag_str = $dom->find('style')->[0]->to_string;
     447#    my $page_style_tag_str = $dom->find('html style')->[0]->to_string;
     448    my $page_style_tag_str = $dom->at('html')->at('style')->to_string;
     449    # In the style tag, convert id style references to class style references
     450    my $css_class = ".p".$page_num."f";
     451    $page_style_tag_str =~ s@\#f@$css_class@sg;
     452    my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified   
     453#$self->_debug_print_html($style_element);
     454
     455    # need to know the image's height to set the height of the surrounding
     456    # div that's to replace this page's <body>:
     457    my $img_height = $dom->find('img')->[0]{height};
     458
     459
     460    # 1. Fix up the style attr on the image by additionally setting z-index=-1 for it
     461    # 2. Adjust the img#background src attribute to point to the pages subdir for imgs
     462    # 3. Set that img tag's class=background, and change its id to background+$page_num
     463    my $bg_img_tag=$dom->find('img#background')->[0];
     464
     465    my $img_style_str = $bg_img_tag->{style}; # = $dom->find('img#background')->[0]{style}
     466    $img_style_str = $img_style_str." z-index=-1;";
     467#print STDERR "img_style_str: " . $img_style_str."\n";
     468    my $img_src_str = $bg_img_tag->{src};
     469    $img_src_str = "pages/$img_src_str";
     470    $bg_img_tag->attr({style => $img_style_str, src => $img_src_str}); # reset
     471#$self->_debug_print_html($bg_img_tag);
     472    # set both class and modified id attributes in one step:
     473    $bg_img_tag->attr({class => "background", id => "background".$page_num});
     474#$self->_debug_print_html($bg_img_tag);
     475
     476    # get all the <span> nested inside <div class="txt"> elements and
     477    # 1. set their class attr to be "p + page_num + id-of-the-span",
     478    # 2. then delete the id, because the span ids have been reused when element
     479    # ids ought to be unique. Which is why we set the modified ids to be the
     480    # value of the class attribute instead
     481    $dom->find('div.txt span')->each(sub {
     482    $_->attr(class => "p". $page_num. $_->{id});
     483    delete $_->{id};
     484                     }); # both changes done in one find() operation
     485#$self->_debug_print_html($dom->find('div.txt span')->last);
     486
     487    # Finally can create our new dom, starting with a div tag for the current page
     488    # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/>
     489    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" );
     490#$self->_debug_print_html($new_dom);
     491    $new_dom->at('div')->append_content($style_element)->root;
     492
     493    # Append a page range bucket heading if applicable
     494    # Dr Bainbridge thinks for now we need only consider PDFs where the
     495    # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
     496    # If number of remaining pages >= 10, then create new bucket heading
     497    # e.g. "Pages 30-40"
     498    if(($num_html_pages - $page_num) > 10) {
     499    # Double-digit page numbers that start with 2
     500    # i.e. 21 to 29 (and 30) should be in 21 to 30 range
     501    my $start_range = $page_num - ($page_num % 10) + 1;
     502    my $end_range = $page_num + 10 - ($page_num % 10);
     503    if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range
     504        $start_range -= 10;
     505        $end_range -= 10;
     506    }
     507    $new_dom->at('div')->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range))->root;
     508    }
     509
     510    # Add a simpler heading: just the pagenumber, "Page #"
     511    $new_dom->at('div')->append_content($new_dom->new_tag('h2', "Page ".$page_num))->root;
     512#$self->_debug_print_html($new_dom);
     513    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
     514    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
     515#$self->_debug_print_html($new_dom);
     516
     517    # Finished processing a single html page of the paged_html output generated by
     518    # Xpdf's pdftohtml: finished massaging that single html page into the right form
     519    return $new_dom->to_string;
     520}
     521
     522# This subroutine is called to do the PDFPlugin post-processing for all cases
     523# except the "paged_html" conversion mode. This is what PDFPlugin always used to do:
     524sub default_convert_post_process
     525{
     526    my $self = shift (@_);
     527    my ($conv_filename) = @_;
    283528    my $outhandle=$self->{'outhandle'};
    284529
Note: See TracChangeset for help on using the changeset viewer.