Context Navigation

← Previous Change
Next Change →

greenstone2

Timestamp:

2018-06-22T22:04:16+12:00 (6 years ago)

Author:

ak19

Message:

1. ConvertBinaryFile.pm no longer knows more than necessary about PDFPlugin's new paged_html output mode; 2. PDFPlugin.pm produces a more correct final html in paged_html output mode: the headings for the current page and page buckets of size 10 (e.g. Pages 21-30) no longer appear behind the generated screenshot of each page on preview. It wasn't a z-index problem, or rather, it was better solved by having an outer div and letting the normal DOM doc flow take care of the rest.

Location:

main/trunk/greenstone2/perllib/plugins

Files:

: 2 edited

ConvertBinaryFile.pm (modified) (2 diffs)
PDFPlugin.pm (modified) (9 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

-              r32205
+              r32206
         $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type";
+    }
-    } elsif ($output_type eq "paged_html") {
-    $output_filename =~ s/$lc_suffix$/.html/;
     } else {
     $output_filename =~ s/$lc_suffix$/.$output_type/;
 …
     if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline
-    # We used to return -1 here if $conv_filename didn't exist at this stage
-    # However, for "paged_html" convert_to mode, the converted HTML file $conv_filename
-    # will only be created from conversion products *after* convert_post_process() returns
     my $output_type=$self->{'convert_to'};
     if ($output_type ne "paged_html" && ! -e "$conv_filename") {return -1;}
+    if (!&FileUtils::fileExists($conv_filename)) {return -1;}
     $self->{'conv_filename'} = $conv_filename;
     $self->convert_post_process($conv_filename);
+    if ($output_type eq "paged_html" && ! -e "$conv_filename") {return -1;}
+    # Check if, after post-processing, the final expected output file has changed
+    # And if it has, check that the final output file now exists after post processing
+    if(defined $self->{'conv_filename_after_post_process'}) {
+    $conv_filename = $self->{'conv_filename_after_post_process'};
+    if (!&FileUtils::fileExists($conv_filename)) {return -1;}
+    }
     # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file

main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

-              r32205
+              r32206
+}
+# Overrides ConvertBinaryFile::run_conversion_command to do some extra handling for paged_html output mode
+sub run_conversion_command {
+    my $self = shift (@_);
+    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;
+    if($self->{'convert_to'} ne "paged_html") {
+    return $self->ConvertBinaryFile::run_conversion_command(@_); # any other output mode: defer entirely to the inherited implementation
+    }
+    # If output mode is paged_html, we use Xpdf tools' pdftohtml and tell it
+    # to create a subdir called "pages" in the tmp area to put its products
+    # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory
+    # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,
+    # the intermediary output file tmp/<random-num>/pages/index.html should
+    # exist (besides other output products there).
+    # We let ConvertBinaryFile proceed normally, but the return value should reflect
+    # that on success it should expect the intermediary product tmpdir/pages/index.html
+    # (which is the product of the Xpdf pdftohtml conversion).
+    my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_);
+    $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html"); # the inherited call's return value is replaced by the expected intermediary path
+    # However, when convert_post_process() is done, it should have output the final
+    # product of the paged_html conversion: an html file of the same name and in the
+    # same tmp location as the input PDF file.
+    my ($name_prefix, $output_dir, $ext)
+    = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$"); # suffix pattern: the last dot and everything after it
+    $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
+#    print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";
+    return $output_filename; # tmp_dirname/pages/index.html; the final html path is recorded in $self->{'conv_filename_after_post_process'}
+}
 sub convert_post_process
+{
 …
     my $outhandle=$self->{'outhandle'};
+#    print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n";
+    if($self->{'convert_to'} eq "paged_html") {
+    if($self->{'convert_to'} eq "paged_html") {
     # special post-processing for paged_html mode, as HTML pages generated
     # by xpdf's pdftohtml need to be massaged into the form we want
 …
+{
     my $self = shift (@_);
+    my ($output_filename) = @_; # output_filename = tmp location + filename
+    # if a single html were generated.
+    # We just want the tmp location, append "pages", and read all the html files
+    # in except for index.html. Then we create a new html file by name
+    # $output_filename, which will consist of a slightly modified version of
+    my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode
+    my $output_filename = $self->{'conv_filename_after_post_process'};
+    # Read in all the html files in tmp's "pages" subdir, except for index.html.
+    # and use it to create a new html file called $self->{'conv_filename_after_post_process'}
+    # which will consist of a slightly modified version of
     # each of the other html files concatenated together.
     my $outhandle=$self->{'outhandle'};
+    my ($tailname, $tmp_dir, $suffix)
+    = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
+    my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages");
+    my ($tailname, $pages_subdir, $suffix)
+    = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");
     # Code from util::create_itemfile()
 …
     # Xpdf's pdftohtml for each of its individual html pages.
     my $start_text = "<html>\n<head>\n";
+    $start_text .= "<title>$tailname</title>\n";
+    my ($output_tailname, $tmp_subdir, $html_suffix)
+    = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
+    $start_text .= "<title>$output_tailname</title>\n";
     $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
     $start_text .= "</head>\n<body>\n\n";
 …
     # subdir, since we've now processed them all into a single html file
     # one folder level up and we don't want HTMLPlugin to process all of them next.
-#    my @fullpath_page_files = map { &FileUtils::filenameConcatenate($pages_subdir, $_) } @page_files;
     &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); #  no specific whitelist, but blacklist htm(l)
 …
     # can't seem to determine type of string with ref/reftype
     # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable
+    # $dom objects appear to get correctly stringified in string contexts
+    # Not needed, as $dom objects seem to get correctly stringified in string contexts
     # $dom.to_string/$dom.stringify seem to get called, no need to call them
     # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
 …
     # store the first style element, which is the only one and in the <body>
     # we'll later insert it as child of an all-encompassing div that we'll create
-#    my $page_style_tag_str = $dom->find('style')->[0]->to_string;
-#    my $page_style_tag_str = $dom->find('html style')->[0]->to_string;
     my $page_style_tag_str = $dom->at('html')->at('style')->to_string;
     # In the style tag, convert id style references to class style references
 …
     my $img_height = $dom->find('img')->[0]{height};
-    # 1. Fix up the style attr on the image by additionally setting z-index=-1 for it
     # 2. Adjust the img#background src attribute to point to the pages subdir for imgs
     # 3. Set that img tag's class=background, and change its id to background+$page_num
     my $bg_img_tag=$dom->find('img#background')->[0];
-    my $img_style_str = $bg_img_tag->{style}; # = $dom->find('img#background')->[0]{style}
-    $img_style_str = $img_style_str." z-index=-1;";
-#print STDERR "img_style_str: " . $img_style_str."\n";
     my $img_src_str = $bg_img_tag->{src};
     $img_src_str = "pages/$img_src_str";
     $bg_img_tag->attr({style => $img_style_str, src => $img_src_str}); # reset
+    $bg_img_tag->attr(src => $img_src_str); # reset
 #$self->_debug_print_html($bg_img_tag);
     # set both class and modified id attributes in one step:
 …
     # Finally can create our new dom, starting with a div tag for the current page
     # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/>
+    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" );
+#    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" )
+    my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );
 #$self->_debug_print_html($new_dom);
     $new_dom->at('div')->append_content($style_element)->root;
+    # Append a page range bucket heading if applicable
+#$self->_debug_print_html($new_dom);
+    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
+    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
+#$self->_debug_print_html($new_dom);
+    # build up the outer div with the <h>tags for sectionalising
+    my $inner_div_str = $new_dom->to_string;
+    my $page_div = "<div id=\"page".$page_num."\">\n";
+    # Append a page range bucket heading if applicable: if we have more than 10 pages
+    # to display in the current bucket AND we're on the first page of each bucket of 10 pages.
     # Dr Bainbridge thinks for now we need only consider PDFs where the
     # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
     # If number of remaining pages >= 10, then create new bucket heading
     # e.g. "Pages 30-40"
     if(($num_html_pages - $page_num) > 10) {
+    if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) {
     # Double-digit page numbers that start with 2
     # i.e. 21 to 29 (and 30) should be in 21 to 30 range
     my $start_range = $page_num - ($page_num % 10) + 1;
     my $end_range = $page_num + 10 - ($page_num % 10);
+    if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range
+        $start_range -= 10;
+        $end_range -= 10;
+    }
+    $new_dom->at('div')->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . $end_range))->root;
+    }
+    # Add a simpler heading: just the pagenumber, "Page #"
+    $page_div .= "<h1 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h1>\n";
+    }
+    # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #"
+    $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h2>\n";
     $new_dom->at('div')->append_content($new_dom->new_tag('h2', "Page ".$page_num))->root;
+#$self->_debug_print_html($new_dom);
+    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
+    $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string
+#$self->_debug_print_html($new_dom);
+    $page_div .= $inner_div_str;
+    $page_div .= "\n</div>";
     # Finished processing a single html page of the paged_html output generated by
     # Xpdf's pdftohtml: finished massaging that single html page into the right form
     return $new_dom->to_string;
+    return $page_div;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 32206 for main/trunk/greenstone2

Legend:

main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

Download in other formats: