Changeset 32289 for main

Show
Ignore:
Timestamp:
18.07.2018 21:11:21 (13 months ago)
Author:
ak19
Message:

The PDFPlugin is being deprecated, since the PDFv1 and PDFv2 plugins are replacing it. PDFPlugin itself will remain available for migrating users, but it contained code added to support paged_html/xpdftools before the plugin was refactored into v1 and v2. This commit removes all the xpdftools/paged_html related changes added to the deprecated PDFPlugin since revision 31494, so that it is back to using just the old pdftohtml tool and the pdf-box extension; the xpdftools functionality has been moved into PDFv2Plugin. Other changes made since revision 31494, such as the deprecation messages and the use of translation strings instead of hardcoded English strings, remain in PDFPlugin.pm because they are generally relevant.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32277 r32289  
    3131use ReadTextFile; 
    3232use unicode; 
    33 use Mojo::DOM; # for HTML parsing 
    3433 
    3534use AutoLoadConverters; 
     
    4645      { 'name' => "text", 
    4746    'desc' => "{ConvertBinaryFile.convert_to.text}" }, 
    48       { 'name' => "paged_html", 
    49     'desc' => "{PDFPlugin.convert_to.paged_html}"}, 
    5047      { 'name' => "pagedimg_jpg", 
    5148    'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"}, 
     
    9794       'desc' => "{PDFPlugin.zoom}", 
    9895       'deft' => "2", 
    99 #       'range' => "1,3", # actually the range is 0.5-3  
    100        'type' => "string" }, 
     96       'range' => "1,3", # actually the range is 0.5-3  
     97       'type' => "int" }, 
    10198     { 'name' => "use_sections", 
    10299       'desc' => "{PDFPlugin.use_sections}", 
     
    164161    elsif ($self->{'convert_to'} eq "auto") { 
    165162    # choose html ?? is this the best option 
    166     $self->{'convert_to'} = "paged_html"; 
     163    $self->{'convert_to'} = "html"; 
    167164    } 
    168165    if ($self->{'use_realistic_book'}) { 
     
    215212        push(@$specific_options, "-use_realistic_book"); 
    216213    } 
    217         if($self->{'convert_to'} eq "paged_html") { # for paged html, the default should be to sectionalise on headings the single superpage containing divs representing individual pages as section 
    218             push(@$specific_options, "sectionalise_using_h_tags"); 
    219         } 
    220214    } 
    221215    elsif ($secondary_plugin_name eq "PagedImagePlugin") { 
     
    292286} 
    293287 
    294 # Overriding to do some extra handling for paged_html output mode 
    295 sub run_conversion_command { 
    296     my $self = shift (@_); 
    297     my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;     
    298  
    299     if($self->{'convert_to'} ne "paged_html") { 
    300     return $self->ConvertBinaryFile::run_conversion_command(@_); 
    301     } 
    302  
    303     # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it 
    304     # to create a subdir called "pages" in the tmp area to put its products 
    305     # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory 
    306     # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run, 
    307     # the intermediary output file tmp/<random-num>/pages/index.html should 
    308     # exist (besides other output products there) 
    309  
    310     # We let ConvertBinaryFile proceed normally, but the return value should reflect 
    311     # that on success it should expect the intermediary product tmpdir/pages/index.html 
    312     # (which is the product of xpdftohtml conversion). 
    313     my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_); 
    314     $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html"); 
    315  
    316     # However, when convert_post_process() is done, it should have output the final 
    317     # product of the paged_html conversion: an html file of the same name and in the 
    318     # same tmp location as the input PDF file. 
    319  
    320     my ($name_prefix, $output_dir, $ext) 
    321     = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$"); 
    322     $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html"); 
    323 #    print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n"; 
    324  
    325     return $output_filename; 
    326 } 
    327  
    328288sub convert_post_process 
    329289{ 
     
    331291    my ($conv_filename) = @_; 
    332292 
    333     my $outhandle=$self->{'outhandle'}; 
    334  
    335     if($self->{'convert_to'} eq "paged_html") { 
    336     # special post-processing for paged_html mode, as HTML pages generated 
    337     # by xpdf's pdftohtml need to be massaged into the form we want  
    338     $self->xpdftohtml_convert_post_process($conv_filename); 
    339     } 
    340     else { # use PDFPlugin's usual post processing 
    341     $self->default_convert_post_process($conv_filename); 
    342     } 
    343 } 
    344  
    345 # Called after gsConvert.pl has been run to convert a PDF to paged_html 
    346 # using Xpdftools' pdftohtml 
    347 # This method will do some cleanup of the HTML files produced after XPDF has produced 
    348 # an HTML doc for each PDF page: it first gets rid of the default index.html. 
    349 # Instead, it constructs a single html page containing each original HTML page 
    350 # <body> nested as divs instead, with simple section information inserted at the top 
    351 # of each 'page' <div> and some further styling customisation. This HTML manipulation 
    352 # is to be done with the Mojo::DOM perl package. 
    353 # Note that since xpdf's pdftohtml would have failed if the output dir already 
    354 # existed and for simpler naming, the output files are created in a new "pages" 
    355 # subdirectory of the tmp location parent of $conv_filename instead 
    356 sub xpdftohtml_convert_post_process 
    357 { 
    358     my $self = shift (@_); 
    359     my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode 
    360     my $output_filename = $self->{'conv_filename_after_post_process'}; 
    361  
    362     # Read in all the html files in tmp's "pages" subdir, except for index.html. 
    363     # and use it to create a new html file called $self->{'conv_filename_after_post_process'} 
    364     # which will consist of a slightly modified version of 
    365     # each of the other html files concatenated together. 
    366  
    367     my $outhandle=$self->{'outhandle'}; 
    368  
    369     my ($tailname, $pages_subdir, $suffix) 
    370     = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$"); 
    371  
    372     # Code from util::create_itemfile() 
    373     # Read in all the files 
    374     opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!"; 
    375     my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR); 
    376     closedir DIR; 
    377     # Sort files in the directory by page_num 
    378     # files are named index.html, page1.html, page2.html, ..., pagen.html 
    379     sub page_number { 
    380     my ($dir) = @_; 
    381     my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i); 
    382     $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0 
    383     return $pagenum; 
    384     } 
    385     # sort the files in the directory in the order of page_num rather than lexically. 
    386     @page_files = sort { page_number($a) <=> page_number($b) } @page_files; 
    387  
    388     #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file. 
    389               # For every html file there's an img file, so halve the total num. 
    390               # What about other file types that may potentially be there too??? 
    391     my $num_html_pages = 0; 
    392     foreach my $pagefile (@page_files) { 
    393     $num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;  
    394     } 
    395  
    396     # Prepare to create our new html page that will contain all the individual 
    397     # htmls generated by xpdf's pdftohtml in sequence. 
    398     # First write the opening html tags out to the output file. These are the 
    399     # same tags and their contents, including <meta>, as is generated by  
    400     # Xpdf's pdftohtml for each of its individual html pages. 
    401     my $start_text = "<html>\n<head>\n"; 
    402     my ($output_tailname, $tmp_subdir, $html_suffix) 
    403     = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$"); 
    404     $start_text .= "<title>$output_tailname</title>\n"; 
    405     $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"; 
    406     $start_text .= "</head>\n<body>\n\n"; 
    407     $start_text .= "<h1>$output_tailname</h1>\n\n"; 
    408  
    409     #handle content encodings the same way that default_convert_post_process does 
    410     # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write     
    411     # Don't want to build a giant string in memory of all the pages concatenated 
    412     # and then write it out in one go. Instead, build up the final single page 
    413     # by writing each modified paged_html file out to it as this is processed. 
    414     # Copying file open/close code from CommonUtil::utf8_write_file() 
    415     if (!open (OUTFILE, ">:utf8", $output_filename)) { 
    416     gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename); 
    417     die "\n"; 
    418     } 
    419     print OUTFILE $start_text; 
    420  
    421     # Get the contents of each individual HTML page generated by Xpdf, after first 
    422     # modifying each, and write each out into our single all-encompassing html 
    423     foreach my $pagefile (@page_files) { 
    424     if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) { 
    425         my $page_num = page_number($pagefile);     
    426         # get full path to pagefile 
    427         $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile); 
    428 #       print STDERR "@@@ About to process html file $pagefile (num $page_num)\n"; 
    429         my $modified_page_contents = $self->_process_paged_html_page($pagefile, $page_num, $num_html_pages); 
    430         print OUTFILE "$modified_page_contents\n\n"; 
    431     } 
    432     } 
    433  
    434     # we've now created a single HTML file by concatenating (a modified version) 
    435     # of each paged html file 
    436     print OUTFILE "</body>\n</html>\n"; # write out closing tags 
    437     close OUTFILE; # done 
    438  
    439     # Get rid of all the htm(l) files incl index.html in the associated "pages" 
    440     # subdir, since we've now processed them all into a single html file 
    441     # one folder level up and we don't want HTMLPlugin to process all of them next. 
    442     &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); #  no specific whitelist, but blacklist htm(l) 
    443  
    444     # now the tmp area should contain a single html file containing all the html pages' 
    445     # contents in sequence, and a "pages" subdir containing the screenshot images 
    446     # of each page.     
    447     # HTMLPlugin will process these further in the plugin pipeline 
    448 } 
    449  
    450 # For whatever reason, most html <tags> don't get printed out in GLI 
    451 # So when debugging, use this function to print them out as [tags] instead. 
    452 sub _debug_print_html 
    453 { 
    454     my $self = shift (@_); 
    455     my ($string_or_dom) = @_; 
    456  
    457     # can't seem to determine type of string with ref/reftype 
    458     # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable 
    459     # Not needed, as $dom objects seem to get correctly stringified in string contexts 
    460     # $dom.to_string/$dom.stringify seem to get called, no need to call them 
    461     # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl 
    462     my $escapedTxt = $string_or_dom;  
    463     $escapedTxt =~ s@\<@[@sg; 
    464     $escapedTxt =~ s@\>@]@sg; 
    465  
    466     print STDERR "#### $escapedTxt\n"; 
    467 } 
    468  
    469 # Helper function to read in each paged_html generated by Xpdf's pdftohtml 
    470 # then modify the html suitably using the HTML parsing functions offered by  
    471 # Mojo::DOM, then return the modified HTML content as a string 
    472 # See https://mojolicious.org/perldoc/Mojo/DOM 
    473 sub _process_paged_html_page 
    474 { 
    475     my $self = shift (@_); 
    476     my ($pagefile, $page_num, $num_html_pages) = @_; 
    477  
    478     my $text = ""; 
    479  
    480     # handling content encoding the same way default_convert_post_process does 
    481     $self->read_file ($pagefile, "utf8", "", \$text); 
    482  
    483     my $dom = Mojo::DOM->new($text); 
    484  
    485 #    $self->_debug_print_html($dom); 
    486  
    487     # there's a <style> element on the <html>, we need to shift it into the <div> 
    488     # tag that we'll be creating. We'll first slightly modify the <style> element 
    489     # store the first style element, which is the only one and in the <body> 
    490     # we'll later insert it as child of an all-encompassing div that we'll create 
    491     my $page_style_tag_str = $dom->at('html')->at('style')->to_string; 
    492     # In the style tag, convert id style references to class style references 
    493     my $css_class = ".p".$page_num."f"; 
    494     $page_style_tag_str =~ s@\#f@$css_class@sg; 
    495     my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified     
    496 #$self->_debug_print_html($style_element); 
    497  
    498     # need to know the image's height to set the height of the surrounding 
    499     # div that's to replace this page's <body>: 
    500     my $img_height = $dom->find('img')->[0]{height}; 
    501  
    502     # 2. Adjust the img#background src attribute to point to the pages subdir for imgs 
    503     # 3. Set that img tag's class=background, and change its id to background+$page_num 
    504     my $bg_img_tag=$dom->find('img#background')->[0]; 
    505     my $img_src_str = $bg_img_tag->{src}; 
    506     $img_src_str = "pages/$img_src_str"; 
    507     $bg_img_tag->attr(src => $img_src_str); # reset 
    508 #$self->_debug_print_html($bg_img_tag); 
    509     # set both class and modified id attributes in one step: 
    510     $bg_img_tag->attr({class => "background", id => "background".$page_num}); 
    511 #$self->_debug_print_html($bg_img_tag); 
    512  
    513     # get all the <span> nested inside <div class="txt"> elements and 
    514     # 1. set their class attr to be "p + page_num + id-of-the-span", 
    515     # 2. then delete the id, because the span ids have been reused when element 
    516     # ids ought to be unique. Which is why we set the modified ids to be the 
    517     # value of the class attribute instead 
    518     $dom->find('div.txt span')->each(sub {  
    519     $_->attr(class => "p". $page_num. $_->{id}); 
    520     delete $_->{id}; 
    521                      }); # both changes done in one find() operation 
    522 #$self->_debug_print_html($dom->find('div.txt span')->last); 
    523  
    524     # Finally can create our new dom, starting with a div tag for the current page 
    525     # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/> 
    526 #    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" ) 
    527     my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" ); 
    528 #$self->_debug_print_html($new_dom); 
    529     $new_dom->at('div')->append_content($style_element)->root; 
    530  
    531  
    532 #$self->_debug_print_html($new_dom); 
    533     # Copy across all the old html's body tag's child nodes into the new dom's new div tag 
    534     $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string 
    535 #$self->_debug_print_html($new_dom); 
    536  
    537  
    538     # build up the outer div with the <h>tags for sectionalising 
    539     my $inner_div_str = $new_dom->to_string; 
    540  
    541     my $page_div = "<div id=\"page".$page_num."\">\n"; 
    542     # Append a page range bucket heading if applicable: if we have more than 10 pages 
    543     # to display in the current bucket AND we're on the first page of each bucket of 10 pages. 
    544     # Dr Bainbridge thinks for now we need only consider PDFs where the 
    545     # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...) 
    546     # If number of remaining pages >= 10, then create new bucket heading 
    547     # e.g. "Pages 31-40" 
    548     if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) { 
    549     # Double-digit page numbers that start with 2 
    550     # i.e. 21 to 29 (and 30) should be in 21 to 30 range 
    551     my $start_range = $page_num - ($page_num % 10) + 1; 
    552     my $end_range = $page_num + 10 - ($page_num % 10); 
    553     $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h2>\n"; 
    554     } 
    555  
    556     # No sectionalising for 10 pages or under. Otherwise, every page is a section too, not just buckets 
    557     if($num_html_pages > 10) { 
    558         # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #"   
    559         $page_div .= "<h3 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h3>\n";        
    560     } 
    561  
    562     $page_div .= $inner_div_str; 
    563     $page_div .= "\n</div>"; 
    564  
    565     # Finished processing a single html page of the paged_html output generated by 
    566     # Xpdf's pdftohtml: finished massaging that single html page into the right form 
    567     return $page_div; 
    568 } 
    569  
    570 # This subroutine is called to do the PDFPlugin post-processing for all cases 
    571 # except the "paged_html" conversion mode. This is what PDFPlugin always used to do: 
    572 sub default_convert_post_process 
    573 { 
    574     my $self = shift (@_); 
    575     my ($conv_filename) = @_; 
    576293    my $outhandle=$self->{'outhandle'}; 
    577294