Changeset 32206

Show
Ignore:
Timestamp:
22.06.2018 22:04:16 (3 months ago)
Author:
ak19
Message:

1. ConvertBinaryFile.pm no longer knows more than necessary about PDFPlugin's new paged_html output mode; 2. PDFPlugin.pm produces a more correct final html in paged_html output mode: the headings for the current page and page buckets of size 10 (e.g. Pages 21-30) no longer appear behind the generated screenshot of each page on preview. It wasn't a z-index problem, or rather, it was better solved by having an outer div and letting the normal DOM doc flow take care of the rest.

Location:
main/trunk/greenstone2/perllib/plugins
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r32205 r32206  
    349349        $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type"; 
    350350    } 
    351     } elsif ($output_type eq "paged_html") { 
    352     $output_filename =~ s/$lc_suffix$/.html/; 
    353351    } else { 
    354352    $output_filename =~ s/$lc_suffix$/.$output_type/; 
     
    374372    if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline  
    375373 
    376     # We used to return -1 here if $conv_filename didn't exist at this stage 
    377     # However, for "paged_html" convert_to mode, the converted HTML file $conv_filename  
    378     # will only be created from conversion products *after* convert_post_process() returns 
    379374    my $output_type=$self->{'convert_to'}; 
    380     if ($output_type ne "paged_html" && ! -e "$conv_filename") {return -1;}   
     375    if (!&FileUtils::fileExists($conv_filename)) {return -1;} 
    381376    $self->{'conv_filename'} = $conv_filename; 
    382377    $self->convert_post_process($conv_filename); 
    383     if ($output_type eq "paged_html" && ! -e "$conv_filename") {return -1;}   
     378 
     379    # Check if, after post-processing, the final expected output file has changed 
     380    # And if it has, check that the final output file now exists after post processing 
     381    if(defined $self->{'conv_filename_after_post_process'}) {    
     382    $conv_filename = $self->{'conv_filename_after_post_process'}; 
     383    if (!&FileUtils::fileExists($conv_filename)) {return -1;} 
     384    } 
    384385 
    385386    # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file 
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32205 r32206  
    281281} 
    282282 
     283# Overriding to do some extra handling for paged_html output mode 
     284sub run_conversion_command { 
     285    my $self = shift (@_); 
     286    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;     
     287 
     288    if($self->{'convert_to'} ne "paged_html") { 
     289    return $self->ConvertBinaryFile::run_conversion_command(@_); 
     290    } 
     291 
     292    # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it 
     293    # to create a subdir called "pages" in the tmp area to put its products 
     294    # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory 
     295    # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run, 
     296    # the intermediary output file tmp/<random-num>/pages/index.html should 
     297    # exist (besides other output products there) 
     298 
     299    # We let ConvertBinaryFile proceed normally, but the return value should reflect 
     300    # that on success it should expect the intermediary product tmpdir/pages/index.html 
     301    # (which is the product of xpdftohtml conversion). 
     302    my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_); 
     303    $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html"); 
     304 
     305    # However, when convert_post_process() is done, it should have output the final 
     306    # product of the paged_html conversion: an html file of the same name and in the 
     307    # same tmp location as the input PDF file. 
     308 
     309    my ($name_prefix, $output_dir, $ext) 
     310    = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$"); 
     311    $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html"); 
     312#    print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n"; 
     313 
     314    return $output_filename; 
     315} 
     316 
    283317sub convert_post_process 
    284318{ 
     
    287321 
    288322    my $outhandle=$self->{'outhandle'}; 
    289 #    print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n"; 
    290  
    291     if($self->{'convert_to'} eq "paged_html") {  
     323 
     324    if($self->{'convert_to'} eq "paged_html") { 
    292325    # special post-processing for paged_html mode, as HTML pages generated 
    293326    # by xpdf's pdftohtml need to be massaged into the form we want  
     
    313346{ 
    314347    my $self = shift (@_); 
    315     my ($output_filename) = @_; # output_filename = tmp location + filename  
    316     # if a single html were generated. 
    317     # We just want the tmp location, append "pages", and read all the html files 
    318     # in except for index.html. Then we create a new html file by name 
    319     # $output_filename, which will consist of a slightly modified version of 
     348    my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode 
     349    my $output_filename = $self->{'conv_filename_after_post_process'}; 
     350 
     351    # Read in all the html files in tmp's "pages" subdir, except for index.html, 
     352    # and use them to create a new html file called $self->{'conv_filename_after_post_process'} 
     353    # which will consist of a slightly modified version of 
    320354    # each of the other html files concatenated together. 
    321355 
    322356    my $outhandle=$self->{'outhandle'}; 
    323357 
    324     my ($tailname, $tmp_dir, $suffix) 
    325     = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$"); 
    326     my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages"); 
     358    my ($tailname, $pages_subdir, $suffix) 
     359    = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$"); 
    327360 
    328361    # Code from util::create_itemfile() 
     
    356389    # Xpdf's pdftohtml for each of its individual html pages. 
    357390    my $start_text = "<html>\n<head>\n"; 
    358     $start_text .= "<title>$tailname</title>\n"; 
     391    my ($output_tailname, $tmp_subdir, $html_suffix) 
     392    = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$"); 
     393    $start_text .= "<title>$output_tailname</title>\n"; 
    359394    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"; 
    360395    $start_text .= "</head>\n<body>\n\n"; 
     
    393428    # subdir, since we've now processed them all into a single html file 
    394429    # one folder level up and we don't want HTMLPlugin to process all of them next. 
    395 #    my @fullpath_page_files = map { &FileUtils::filenameConcatenate($pages_subdir, $_) } @page_files; 
    396430    &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); #  no specific whitelist, but blacklist htm(l) 
    397431 
     
    411445    # can't seem to determine type of string with ref/reftype 
    412446    # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable 
    413  
    414     # $dom objects appear to get correctly stringified in string contexts 
     447    # Not needed, as $dom objects seem to get correctly stringified in string contexts 
    415448    # $dom.to_string/$dom.stringify seem to get called, no need to call them 
    416449    # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl 
     
    444477    # store the first style element, which is the only one and in the <body> 
    445478    # we'll later insert it as child of an all-encompassing div that we'll create 
    446 #    my $page_style_tag_str = $dom->find('style')->[0]->to_string; 
    447 #    my $page_style_tag_str = $dom->find('html style')->[0]->to_string; 
    448479    my $page_style_tag_str = $dom->at('html')->at('style')->to_string; 
    449480    # In the style tag, convert id style references to class style references 
     
    457488    my $img_height = $dom->find('img')->[0]{height}; 
    458489 
    459  
    460     # 1. Fix up the style attr on the image by additionally setting z-index=-1 for it 
    461490    # 2. Adjust the img#background src attribute to point to the pages subdir for imgs 
    462491    # 3. Set that img tag's class=background, and change its id to background+$page_num 
    463492    my $bg_img_tag=$dom->find('img#background')->[0]; 
    464  
    465     my $img_style_str = $bg_img_tag->{style}; # = $dom->find('img#background')->[0]{style} 
    466     $img_style_str = $img_style_str." z-index=-1;"; 
    467 #print STDERR "img_style_str: " . $img_style_str."\n"; 
    468493    my $img_src_str = $bg_img_tag->{src}; 
    469494    $img_src_str = "pages/$img_src_str"; 
    470     $bg_img_tag->attr({style => $img_style_str, src => $img_src_str}); # reset 
     495    $bg_img_tag->attr(src => $img_src_str); # reset 
    471496#$self->_debug_print_html($bg_img_tag); 
    472497    # set both class and modified id attributes in one step: 
     
    487512    # Finally can create our new dom, starting with a div tag for the current page 
    488513    # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/> 
    489     my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" ); 
     514#    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" ) 
     515    my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" ); 
    490516#$self->_debug_print_html($new_dom); 
    491517    $new_dom->at('div')->append_content($style_element)->root; 
    492518 
    493     # Append a page range bucket heading if applicable 
     519 
     520#$self->_debug_print_html($new_dom); 
     521    # Copy across all the old html's body tag's child nodes into the new dom's new div tag 
     522    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string 
     523#$self->_debug_print_html($new_dom); 
     524 
     525 
     526    # build up the outer div with the <h>tags for sectionalising 
     527    my $inner_div_str = $new_dom->to_string; 
     528 
     529    my $page_div = "<div id=\"page".$page_num."\">\n"; 
     530    # Append a page range bucket heading if applicable: if we have more than 10 pages 
     531    # to display in the current bucket AND we're on the first page of each bucket of 10 pages. 
    494532    # Dr Bainbridge thinks for now we need only consider PDFs where the 
    495533    # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...) 
    496534    # If number of remaining pages >= 10, then create new bucket heading 
    497535    # e.g. "Pages 30-40" 
    498     if(($num_html_pages - $page_num) > 10) { 
     536    if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) { 
    499537    # Double-digit page numbers that start with 2 
    500538    # i.e. 21 to 29 (and 30) should be in 21 to 30 range 
    501539    my $start_range = $page_num - ($page_num % 10) + 1; 
    502540    my $end_range = $page_num + 10 - ($page_num % 10); 
    503     if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range 
    504         $start_range -= 10; 
    505         $end_range -= 10; 
    506     } 
    507     $new_dom->at('div')->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range))->root; 
    508     } 
    509  
    510     # Add a simpler heading: just the pagenumber, "Page #" 
     541    $page_div .= "<h1 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h1>\n"; 
     542    } 
     543 
     544    # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #" 
     545    $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h2>\n"; 
    511546    $new_dom->at('div')->append_content($new_dom->new_tag('h2', "Page ".$page_num))->root; 
    512 #$self->_debug_print_html($new_dom); 
    513     # Copy across all the old html's body tag's child nodes into the new dom's new div tag 
    514     $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string 
    515 #$self->_debug_print_html($new_dom); 
     547 
     548    $page_div .= $inner_div_str; 
     549    $page_div .= "\n</div>"; 
    516550 
    517551    # Finished processing a single html page of the paged_html output generated by 
    518552    # Xpdf's pdftohtml: finished massaging that single html page into the right form 
    519     return $new_dom->to_string; 
     553    return $page_div; 
    520554} 
    521555