Ignore:
Timestamp:
2018-06-22T22:04:16+12:00 (6 years ago)
Author:
ak19
Message:
  1. ConvertBinaryFile.pm no longer knows more than necessary about PDFPlugin's new paged_html output mode; 2. PDFPlugin.pm produces a more correct final html in paged_html output mode: the headings for the current page and page buckets of size 10 (e.g. Pages 21-30) no longer appear behind the generated screenshot of each page on preview. It wasn't a z-index problem, or rather, it was better solved by having an outer div and letting the normal DOM doc flow take care of the rest.
Location:
main/trunk/greenstone2/perllib/plugins
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r32205 r32206  
    349349        $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type";
    350350    }
    351     } elsif ($output_type eq "paged_html") {
    352     $output_filename =~ s/$lc_suffix$/.html/;
    353351    } else {
    354352    $output_filename =~ s/$lc_suffix$/.$output_type/;
     
    374372    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
    375373
    376     # We used to return -1 here if $conv_filename didn't exist at this stage
    377     # However, for "paged_html" convert_to mode, the converted HTML file $conv_filename
    378     # will only be created from conversion products *after* convert_post_process() returns
    379374    my $output_type=$self->{'convert_to'};
    380     if ($output_type ne "paged_html" && ! -e "$conv_filename") {return -1;} 
     375    if (!&FileUtils::fileExists($conv_filename)) {return -1;}
    381376    $self->{'conv_filename'} = $conv_filename;
    382377    $self->convert_post_process($conv_filename);
    383     if ($output_type eq "paged_html" && ! -e "$conv_filename") {return -1;} 
     378
     379    # Check if, after post-processing, the final expected output file has changed
     380    # And if it has, check that the final output file now exists after post processing
     381    if(defined $self->{'conv_filename_after_post_process'}) {   
     382    $conv_filename = $self->{'conv_filename_after_post_process'};
     383    if (!&FileUtils::fileExists($conv_filename)) {return -1;}
     384    }
    384385
    385386    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32205 r32206  
    281281}
    282282
     283# Overriding to do some extra handling for paged_html output mode
     284sub run_conversion_command {
     285    my $self = shift (@_);
     286    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;   
     287
     288    if($self->{'convert_to'} ne "paged_html") {
     289    return $self->ConvertBinaryFile::run_conversion_command(@_);
     290    }
     291
     292    # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it
     293    # to create a subdir called "pages" in the tmp area to put its products
     294    # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory
     295    # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,
     296    # the intermediary output file tmp/<random-num>/pages/index.html should
     297    # exist (besides other output products there)
     298
     299    # We let ConvertBinaryFile proceed normally, but the return value should reflect
     300    # that on success it should expect the intermediary product tmpdir/pages/index.html
     301    # (which is the product of xpdftohtml conversion).
     302    my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_);
     303    $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");
     304
     305    # However, when convert_post_process() is done, it should have output the final
     306    # product of the paged_html conversion: an html file of the same name and in the
     307    # same tmp location as the input PDF file.
     308
     309    my ($name_prefix, $output_dir, $ext)
     310    = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
     311    $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
     312#    print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";
     313
     314    return $output_filename;
     315}
     316
    283317sub convert_post_process
    284318{
     
    287321
    288322    my $outhandle=$self->{'outhandle'};
    289 #    print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n";
    290 
    291     if($self->{'convert_to'} eq "paged_html") {
     323
     324    if($self->{'convert_to'} eq "paged_html") {
    292325    # special post-processing for paged_html mode, as HTML pages generated
    293326    # by xpdf's pdftohtml need to be massaged into the form we want
     
    313346{
    314347    my $self = shift (@_);
    315     my ($output_filename) = @_; # output_filename = tmp location + filename
    316     # if a single html were generated.
    317     # We just want the tmp location, append "pages", and read all the html files
    318     # in except for index.html. Then we create a new html file by name
    319     # $output_filename, which will consist of a slightly modified version of
     348    my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode
     349    my $output_filename = $self->{'conv_filename_after_post_process'};
     350
     351    # Read in all the html files in tmp's "pages" subdir, except for index.html,
     352    # and use them to create a new html file called $self->{'conv_filename_after_post_process'}
     353    # which will consist of a slightly modified version of
    320354    # each of the other html files concatenated together.
    321355
    322356    my $outhandle=$self->{'outhandle'};
    323357
    324     my ($tailname, $tmp_dir, $suffix)
    325     = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    326     my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages");
     358    my ($tailname, $pages_subdir, $suffix)
     359    = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");
    327360
    328361    # Code from util::create_itemfile()
     
    356389    # Xpdf's pdftohtml for each of its individual html pages.
    357390    my $start_text = "<html>\n<head>\n";
    358     $start_text .= "<title>$tailname</title>\n";
     391    my ($output_tailname, $tmp_subdir, $html_suffix)
     392    = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
     393    $start_text .= "<title>$output_tailname</title>\n";
    359394    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    360395    $start_text .= "</head>\n<body>\n\n";
     
    393428    # subdir, since we've now processed them all into a single html file
    394429    # one folder level up and we don't want HTMLPlugin to process all of them next.
    395 #    my @fullpath_page_files = map { &FileUtils::filenameConcatenate($pages_subdir, $_) } @page_files;
    396430    &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); #  no specific whitelist, but blacklist htm(l)
    397431
     
    411445    # can't seem to determine type of string with ref/reftype
    412446    # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable
    413 
    414     # $dom objects appear to get correctly stringified in string contexts
     447    # Not needed, as $dom objects seem to get correctly stringified in string contexts
    415448    # $dom.to_string/$dom.stringify seem to get called, no need to call them
    416449    # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
     
    444477    # store the first style element, which is the only one and in the <body>
    445478    # we'll later insert it as child of an all-encompassing div that we'll create
    446 #    my $page_style_tag_str = $dom->find('style')->[0]->to_string;
    447 #    my $page_style_tag_str = $dom->find('html style')->[0]->to_string;
    448479    my $page_style_tag_str = $dom->at('html')->at('style')->to_string;
    449480    # In the style tag, convert id style references to class style references
     
    457488    my $img_height = $dom->find('img')->[0]{height};
    458489
    459 
    460     # 1. Fix up the style attr on the image by additionally setting z-index=-1 for it
    461490    # 2. Adjust the img#background src attribute to point to the pages subdir for imgs
    462491    # 3. Set that img tag's class=background, and change its id to background+$page_num
    463492    my $bg_img_tag=$dom->find('img#background')->[0];
    464 
    465     my $img_style_str = $bg_img_tag->{style}; # = $dom->find('img#background')->[0]{style}
    466     $img_style_str = $img_style_str." z-index=-1;";
    467 #print STDERR "img_style_str: " . $img_style_str."\n";
    468493    my $img_src_str = $bg_img_tag->{src};
    469494    $img_src_str = "pages/$img_src_str";
    470     $bg_img_tag->attr({style => $img_style_str, src => $img_src_str}); # reset
     495    $bg_img_tag->attr(src => $img_src_str); # reset
    471496#$self->_debug_print_html($bg_img_tag);
    472497    # set both class and modified id attributes in one step:
     
    487512    # Finally can create our new dom, starting with a div tag for the current page
    488513    # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/>
    489     my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" );
     514#    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" )
     515    my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );
    490516#$self->_debug_print_html($new_dom);
    491517    $new_dom->at('div')->append_content($style_element)->root;
    492518
    493     # Append a page range bucket heading if applicable
     519
     520#$self->_debug_print_html($new_dom);
     521    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
     522    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
     523#$self->_debug_print_html($new_dom);
     524
     525
     526    # build up the outer div with the <h>tags for sectionalising
     527    my $inner_div_str = $new_dom->to_string;
     528
     529    my $page_div = "<div id=\"page".$page_num."\">\n";
     530    # Append a page range bucket heading if applicable: if we have more than 10 pages
     531    # to display in the current bucket AND we're on the first page of each bucket of 10 pages.
    494532    # Dr Bainbridge thinks for now we need only consider PDFs where the
    495533    # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
    496534    # If number of remaining pages >= 10, then create new bucket heading
    497535    # e.g. "Pages 31-40"
    498     if(($num_html_pages - $page_num) > 10) {
     536    if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) {
    499537    # Double-digit page numbers that start with 2
    500538    # i.e. 21 to 29 (and 30) should be in 21 to 30 range
    501539    my $start_range = $page_num - ($page_num % 10) + 1;
    502540    my $end_range = $page_num + 10 - ($page_num % 10);
    503     if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range
    504         $start_range -= 10;
    505         $end_range -= 10;
    506     }
    507     $new_dom->at('div')->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range))->root;
    508     }
    509 
    510     # Add a simpler heading: just the pagenumber, "Page #"
     541    $page_div .= "<h1 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h1>\n";
     542    }
     543
     544    # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #"
     545    $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h2>\n";
    511546    $new_dom->at('div')->append_content($new_dom->new_tag('h2', "Page ".$page_num))->root;
    512 #$self->_debug_print_html($new_dom);
    513     # Copy across all the old html's body tag's child nodes into the new dom's new div tag
    514     $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
    515 #$self->_debug_print_html($new_dom);
     547
     548    $page_div .= $inner_div_str;
     549    $page_div .= "\n</div>";
    516550
    517551    # Finished processing a single html page of the paged_html output generated by
    518552    # Xpdf's pdftohtml: finished massaging that single html page into the right form
    519     return $new_dom->to_string;
     553    return $page_div;
    520554}
    521555
Note: See TracChangeset for help on using the changeset viewer.