Changeset 32289


Ignore:
Timestamp:
2018-07-18T21:11:21+12:00 (6 years ago)
Author:
ak19
Message:

The PDFPlugin is being deprecated (since the PDFv1 and PDFv2 plugins are replacing it). PDFPlugin itself will remain available for migrating users, but it contained code that was added to support paged_html/xpdftools before the plugin was refactored into v1 and v2. This commit removes all the xpdftools/paged_html related changes added to the deprecated PDFPlugin since revision 31494, so that it's back to using just the old pdftohtml tool and the pdf-box extension; the xpdftools functionality has been moved into PDFv2Plugin. Some other changes made since revision 31494, such as deprecation messages and the use of translation strings instead of hardcoded English language strings, remain in PDFPlugin.pm as they're generally relevant.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32277 r32289  
    3131use ReadTextFile;
    3232use unicode;
    33 use Mojo::DOM; # for HTML parsing
    3433
    3534use AutoLoadConverters;
     
    4645      { 'name' => "text",
    4746    'desc' => "{ConvertBinaryFile.convert_to.text}" },
    48       { 'name' => "paged_html",
    49     'desc' => "{PDFPlugin.convert_to.paged_html}"},
    5047      { 'name' => "pagedimg_jpg",
    5148    'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
     
    9794       'desc' => "{PDFPlugin.zoom}",
    9895       'deft' => "2",
    99 #       'range' => "1,3", # actually the range is 0.5-3
    100        'type' => "string" },
     96       'range' => "1,3", # actually the range is 0.5-3
     97       'type' => "int" },
    10198     { 'name' => "use_sections",
    10299       'desc' => "{PDFPlugin.use_sections}",
     
    164161    elsif ($self->{'convert_to'} eq "auto") {
    165162    # choose html ?? is this the best option
    166     $self->{'convert_to'} = "paged_html";
     163    $self->{'convert_to'} = "html";
    167164    }
    168165    if ($self->{'use_realistic_book'}) {
     
    215212        push(@$specific_options, "-use_realistic_book");
    216213    }
    217         if($self->{'convert_to'} eq "paged_html") { # for paged html, the default should be to sectionalise on headings the single superpage containing divs representing individual pages as section
    218             push(@$specific_options, "sectionalise_using_h_tags");
    219         }
    220214    }
    221215    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
     
    292286}
    293287
    294 # Overriding to do some extra handling for paged_html output mode
    295 sub run_conversion_command {
    296     my $self = shift (@_);
    297     my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;   
    298 
    299     if($self->{'convert_to'} ne "paged_html") {
    300     return $self->ConvertBinaryFile::run_conversion_command(@_);
    301     }
    302 
    303     # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it
    304     # to create a subdir called "pages" in the tmp area to put its products
    305     # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory
    306     # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,
    307     # the intermediary output file tmp/<random-num>/pages/index.html should
    308     # exist (besides other output products there)
    309 
    310     # We let ConvertBinaryFile proceed normally, but the return value should reflect
    311     # that on success it should expect the intermediary product tmpdir/pages/index.html
    312     # (which is the product of xpdftohtml conversion).
    313     my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_);
    314     $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");
    315 
    316     # However, when convert_post_process() is done, it should have output the final
    317     # product of the paged_html conversion: an html file of the same name and in the
    318     # same tmp location as the input PDF file.
    319 
    320     my ($name_prefix, $output_dir, $ext)
    321     = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
    322     $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
    323 #    print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";
    324 
    325     return $output_filename;
    326 }
    327 
    328288sub convert_post_process
    329289{
     
    331291    my ($conv_filename) = @_;
    332292
    333     my $outhandle=$self->{'outhandle'};
    334 
    335     if($self->{'convert_to'} eq "paged_html") {
    336     # special post-processing for paged_html mode, as HTML pages generated
    337     # by xpdf's pdftohtml need to be massaged into the form we want
    338     $self->xpdftohtml_convert_post_process($conv_filename);
    339     }
    340     else { # use PDFPlugin's usual post processing
    341     $self->default_convert_post_process($conv_filename);
    342     }
    343 }
    344 
    345 # Called after gsConvert.pl has been run to convert a PDF to paged_html
    346 # using Xpdftools' pdftohtml
    347 # This method will do some cleanup of the HTML files produced after XPDF has produced
    348 # an HTML doc for each PDF page: it first gets rid of the default index.html.
    349 # Instead, it constructs a single html page containing each original HTML page
    350 # <body> nested as divs, with simple section information inserted at the top
    351 # of each 'page' <div> and some further styling customisation. This HTML manipulation
    352 # is to be done with the Mojo::DOM perl package.
    353 # Note that since xpdf's pdftohtml would have failed if the output dir already
    354 # existed and for simpler naming, the output files are created in a new "pages"
    355 # subdirectory of the tmp location parent of $conv_filename instead
    356 sub xpdftohtml_convert_post_process
    357 {
    358     my $self = shift (@_);
    359     my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode
    360     my $output_filename = $self->{'conv_filename_after_post_process'};
    361 
    362     # Read in all the html files in tmp's "pages" subdir, except for index.html,
    363     # and use them to create a new html file called $self->{'conv_filename_after_post_process'}
    364     # which will consist of a slightly modified version of
    365     # each of the other html files concatenated together.
    366 
    367     my $outhandle=$self->{'outhandle'};
    368 
    369     my ($tailname, $pages_subdir, $suffix)
    370     = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");
    371 
    372     # Code from util::create_itemfile()
    373     # Read in all the files
    374     opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!";
    375     my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR);
    376     closedir DIR;
    377     # Sort files in the directory by page_num
    378     # files are named index.html, page1.html, page2.html, ..., pagen.html
    379     sub page_number {
    380     my ($dir) = @_;
    381     my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i);
    382     $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
    383     return $pagenum;
    384     }
    385     # sort the files in the directory in the order of page_num rather than lexically.
    386     @page_files = sort { page_number($a) <=> page_number($b) } @page_files;
    387 
    388     #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file.
    389               # For every html file there's an img file, so halve the total num.
    390               # What about other file types that may potentially be there too???
    391     my $num_html_pages = 0;
    392     foreach my $pagefile (@page_files) {
    393     $num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;
    394     }
    395 
    396     # Prepare to create our new html page that will contain all the individual
    397     # htmls generated by xpdf's pdftohtml in sequence.
    398     # First write the opening html tags out to the output file. These are the
    399     # same tags and their contents, including <meta>, as is generated by
    400     # Xpdf's pdftohtml for each of its individual html pages.
    401     my $start_text = "<html>\n<head>\n";
    402     my ($output_tailname, $tmp_subdir, $html_suffix)
    403     = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    404     $start_text .= "<title>$output_tailname</title>\n";
    405     $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    406     $start_text .= "</head>\n<body>\n\n";
    407     $start_text .= "<h1>$output_tailname</h1>\n\n";
    408 
    409     #handle content encodings the same way that default_convert_post_process does
    410     # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write   
    411     # Don't want to build a giant string in memory of all the pages concatenated
    412     # and then write it out in one go. Instead, build up the final single page
    413     # by writing each modified paged_html file out to it as this is processed.
    414     # Copying file open/close code from CommonUtil::utf8_write_file()
    415     if (!open (OUTFILE, ">:utf8", $output_filename)) {
    416     gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);
    417     die "\n";
    418     }
    419     print OUTFILE $start_text;
    420 
    421     # Get the contents of each individual HTML page generated by Xpdf, after first
    422     # modifying each, and write each out into our single all-encompassing html
    423     foreach my $pagefile (@page_files) {
    424     if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) {
    425         my $page_num = page_number($pagefile);   
    426         # get full path to pagefile
    427         $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
    428 #       print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
    429         my $modified_page_contents = $self->_process_paged_html_page($pagefile, $page_num, $num_html_pages);
    430         print OUTFILE "$modified_page_contents\n\n";
    431     }
    432     }
    433 
    434     # we've now created a single HTML file by concatenating (a modified version)
    435     # of each paged html file
    436     print OUTFILE "</body>\n</html>\n"; # write out closing tags
    437     close OUTFILE; # done
    438 
    439     # Get rid of all the htm(l) files incl index.html in the associated "pages"
    440     # subdir, since we've now processed them all into a single html file
    441     # one folder level up and we don't want HTMLPlugin to process all of them next.
    442     &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); #  no specific whitelist, but blacklist htm(l)
    443 
    444     # now the tmp area should contain a single html file containing all the html pages'
    445     # contents in sequence, and a "pages" subdir containing the screenshot images
    446     # of each page.   
    447     # HTMLPlugin will process these further in the plugin pipeline
    448 }
    449 
    450 # For whatever reason, most html <tags> don't get printed out in GLI
    451 # So when debugging, use this function to print them out as [tags] instead.
    452 sub _debug_print_html
    453 {
    454     my $self = shift (@_);
    455     my ($string_or_dom) = @_;
    456 
    457     # can't seem to determine type of string with ref/reftype
    458     # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable
    459     # Not needed, as $dom objects seem to get correctly stringified in string contexts
    460     # $dom.to_string/$dom.stringify seem to get called, no need to call them
    461     # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
    462     my $escapedTxt = $string_or_dom;
    463     $escapedTxt =~ s@\<@[@sg;
    464     $escapedTxt =~ s@\>@]@sg;
    465 
    466     print STDERR "#### $escapedTxt\n";
    467 }
    468 
    469 # Helper function to read in each paged_html generated by Xpdf's pdftohtml
    470 # then modify the html suitably using the HTML parsing functions offered by
    471 # Mojo::DOM, then return the modified HTML content as a string
    472 # See https://mojolicious.org/perldoc/Mojo/DOM
    473 sub _process_paged_html_page
    474 {
    475     my $self = shift (@_);
    476     my ($pagefile, $page_num, $num_html_pages) = @_;
    477 
    478     my $text = "";
    479 
    480     # handling content encoding the same way default_convert_post_process does
    481     $self->read_file ($pagefile, "utf8", "", \$text);
    482 
    483     my $dom = Mojo::DOM->new($text);
    484 
    485 #    $self->_debug_print_html($dom);
    486 
    487     # There's a <style> element in the <html>; we need to shift it into the <div>
    488     # tag that we'll be creating. We'll first slightly modify the <style> element:
    489     # store the first style element (which is the only one, and is in the <body>);
    490     # we'll later insert it as a child of an all-encompassing div that we'll create.
    491     my $page_style_tag_str = $dom->at('html')->at('style')->to_string;
    492     # In the style tag, convert id style references to class style references
    493     my $css_class = ".p".$page_num."f";
    494     $page_style_tag_str =~ s@\#f@$css_class@sg;
    495     my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified   
    496 #$self->_debug_print_html($style_element);
    497 
    498     # need to know the image's height to set the height of the surrounding
    499     # div that's to replace this page's <body>:
    500     my $img_height = $dom->find('img')->[0]{height};
    501 
    502     # 2. Adjust the img#background src attribute to point to the pages subdir for imgs
    503     # 3. Set that img tag's class=background, and change its id to background+$page_num
    504     my $bg_img_tag=$dom->find('img#background')->[0];
    505     my $img_src_str = $bg_img_tag->{src};
    506     $img_src_str = "pages/$img_src_str";
    507     $bg_img_tag->attr(src => $img_src_str); # reset
    508 #$self->_debug_print_html($bg_img_tag);
    509     # set both class and modified id attributes in one step:
    510     $bg_img_tag->attr({class => "background", id => "background".$page_num});
    511 #$self->_debug_print_html($bg_img_tag);
    512 
    513     # get all the <span> nested inside <div class="txt"> elements and
    514     # 1. set their class attr to be "p + page_num + id-of-the-span",
    515     # 2. then delete the id, because the span ids have been reused when element
    516     # ids ought to be unique. Which is why we set the modified ids to be the
    517     # value of the class attribute instead
    518     $dom->find('div.txt span')->each(sub {
    519     $_->attr(class => "p". $page_num. $_->{id});
    520     delete $_->{id};
    521                      }); # both changes done in one find() operation
    522 #$self->_debug_print_html($dom->find('div.txt span')->last);
    523 
    524     # Finally can create our new dom, starting with a div tag for the current page
    525     # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/>
    526 #    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" )
    527     my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );
    528 #$self->_debug_print_html($new_dom);
    529     $new_dom->at('div')->append_content($style_element)->root;
    530 
    531 
    532 #$self->_debug_print_html($new_dom);
    533     # Copy across all the old html's body tag's child nodes into the new dom's new div tag
    534     $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
    535 #$self->_debug_print_html($new_dom);
    536 
    537 
    538     # build up the outer div with the <h>tags for sectionalising
    539     my $inner_div_str = $new_dom->to_string;
    540 
    541     my $page_div = "<div id=\"page".$page_num."\">\n";
    542     # Append a page range bucket heading if applicable: if we have more than 10 pages
    543     # to display in the current bucket AND we're on the first page of each bucket of 10 pages.
    544     # Dr Bainbridge thinks for now we need only consider PDFs where the
    545     # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
    546     # If number of remaining pages >= 10, then create new bucket heading
    547     # e.g. "Pages 31-40"
    548     if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) {
    549     # Double-digit page numbers that start with 2
    550     # i.e. 21 to 29 (and 30) should be in 21 to 30 range
    551     my $start_range = $page_num - ($page_num % 10) + 1;
    552     my $end_range = $page_num + 10 - ($page_num % 10);
    553     $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h2>\n";
    554     }
    555 
    556     # No sectionalising for 10 pages or under. Otherwise, every page is a section too, not just buckets
    557     if($num_html_pages > 10) {
    558         # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #" 
    559         $page_div .= "<h3 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h3>\n";       
    560     }
    561 
    562     $page_div .= $inner_div_str;
    563     $page_div .= "\n</div>";
    564 
    565     # Finished processing a single html page of the paged_html output generated by
    566     # Xpdf's pdftohtml: finished massaging that single html page into the right form
    567     return $page_div;
    568 }
    569 
    570 # This subroutine is called to do the PDFPlugin post-processing for all cases
    571 # except the "paged_html" conversion mode. This is what PDFPlugin always used to do:
    572 sub default_convert_post_process
    573 {
    574     my $self = shift (@_);
    575     my ($conv_filename) = @_;
    576293    my $outhandle=$self->{'outhandle'};
    577294
Note: See TracChangeset for help on using the changeset viewer.