Changeset 32289 for main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
- Timestamp:
- 2018-07-18T21:11:21+12:00 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r32277 r32289 31 31 use ReadTextFile; 32 32 use unicode; 33 use Mojo::DOM; # for HTML parsing34 33 35 34 use AutoLoadConverters; … … 46 45 { 'name' => "text", 47 46 'desc' => "{ConvertBinaryFile.convert_to.text}" }, 48 { 'name' => "paged_html",49 'desc' => "{PDFPlugin.convert_to.paged_html}"},50 47 { 'name' => "pagedimg_jpg", 51 48 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"}, … … 97 94 'desc' => "{PDFPlugin.zoom}", 98 95 'deft' => "2", 99 #'range' => "1,3", # actually the range is 0.5-3100 'type' => " string" },96 'range' => "1,3", # actually the range is 0.5-3 97 'type' => "int" }, 101 98 { 'name' => "use_sections", 102 99 'desc' => "{PDFPlugin.use_sections}", … … 164 161 elsif ($self->{'convert_to'} eq "auto") { 165 162 # choose html ?? is this the best option 166 $self->{'convert_to'} = " paged_html";163 $self->{'convert_to'} = "html"; 167 164 } 168 165 if ($self->{'use_realistic_book'}) { … … 215 212 push(@$specific_options, "-use_realistic_book"); 216 213 } 217 if($self->{'convert_to'} eq "paged_html") { # for paged html, the default should be to sectionalise on headings the single superpage containing divs representing individual pages as section218 push(@$specific_options, "sectionalise_using_h_tags");219 }220 214 } 221 215 elsif ($secondary_plugin_name eq "PagedImagePlugin") { … … 292 286 } 293 287 294 # Overriding to do some extra handling for paged_html output mode295 sub run_conversion_command {296 my $self = shift (@_);297 my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;298 299 if($self->{'convert_to'} ne "paged_html") {300 return $self->ConvertBinaryFile::run_conversion_command(@_);301 }302 303 # if output mode is paged_html, we use Xpdf tools' pdftohtml and tell it304 # to create a subdir called "pages" in the tmp area to puts its products305 # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory306 # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,307 # the intermediary output file tmp/<random-num>/pages/index.html should308 # exist (besides other output products there)309 310 # We let ConvertBinaryFile proceed normally, but the return value should reflect311 # that on success it should expect the intermediary product tmpdir/pages/index.html312 # (which is the product of xpdftohtml conversion).313 my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_);314 $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");315 316 # However, when convert_post_process() is done, it should have output the final317 # product of the paged_html conversion: an html file of the same name and in the318 # same tmp location as the input PDF file.319 320 my ($name_prefix, $output_dir, $ext)321 = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");322 $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");323 # print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";324 325 return $output_filename;326 }327 328 288 sub convert_post_process 329 289 { … … 331 291 my ($conv_filename) = @_; 332 292 333 my $outhandle=$self->{'outhandle'};334 335 if($self->{'convert_to'} eq "paged_html") {336 # special post-processing for paged_html mode, as HTML pages generated337 # by xpdf's pdftohtml need to be massaged into the form we want338 $self->xpdftohtml_convert_post_process($conv_filename);339 }340 else { # use PDFPlugin's usual post processing341 $self->default_convert_post_process($conv_filename);342 }343 }344 345 # Called after gsConvert.pl has been run to convert a PDF to paged_html346 # using Xpdftools' pdftohtml347 # This method will do some cleanup of the HTML files produced after XPDF has produced348 # an HTML doc for each PDF page: it first gets rid of the default index.html.349 # Instead, it constructs a single html page containing each original HTML page350 # <body> nested as divs instead, with simple section information inserted at the top351 # of each 'page' <div> and some further styling customisation. This HTML manipulation352 # is to be done with the Mojo::DOM perl package.353 # Note that since xpdf's pdftohtml would have failed if the output dir already354 # existed and for simpler naming, the output files are created in a new "pages"355 # subdirectory of the tmp location parent of $conv_filename instead356 sub xpdftohtml_convert_post_process357 {358 my $self = shift (@_);359 my ($pages_index_html) = @_; # = tmp/<rand>/pages/index.html for paged_html output mode360 my $output_filename = $self->{'conv_filename_after_post_process'};361 362 # Read in all the html files in tmp's "pages" subdir, except for index.html.363 # and use it to create a new html file called $self->{'conv_filename_after_post_process'}364 # which will consist of a slightly modified version of365 # each of the other html files concatenated together.366 367 my $outhandle=$self->{'outhandle'};368 369 my ($tailname, $pages_subdir, $suffix)370 = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");371 372 # Code from util::create_itemfile()373 # Read in all the files374 opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!";375 my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR);376 closedir DIR;377 # Sort files in the directory by page_num378 # files are named index.html, page1.html, page2.html, ..., pagen.html379 sub page_number {380 my ($dir) = @_;381 my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i);382 $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0383 return $pagenum;384 }385 # sort the files in the directory in the order of page_num rather than lexically.386 @page_files = sort { page_number($a) <=> page_number($b) } @page_files;387 388 #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file.389 # For every html file there's an img file, so halve the total num.390 # What about other file types that may potentially be there too???391 my $num_html_pages = 0;392 foreach my $pagefile (@page_files) {393 $num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;394 }395 396 # Prepare to create our new html page that will contain all the individual397 # htmls generated by xpdf's pdftohtml in sequence.398 # First write the opening html tags out to the output file. These are the399 # same tags and their contents, including <meta>, as is generated by400 # Xpdf's pdftohtml for each of its individual html pages.401 my $start_text = "<html>\n<head>\n";402 my ($output_tailname, $tmp_subdir, $html_suffix)403 = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");404 $start_text .= "<title>$output_tailname</title>\n";405 $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";406 $start_text .= "</head>\n<body>\n\n";407 $start_text .= "<h1>$output_tailname</h1>\n\n";408 409 #handle content encodings the same way that default_convert_post_process does410 # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write411 # Don't want to build a giant string in memory of all the pages concatenated412 # and then write it out in one go. Instead, build up the final single page413 # by writing each modified paged_html file out to it as this is processed.414 # Copying file open/close code from CommonUtil::utf8_write_file()415 if (!open (OUTFILE, ">:utf8", $output_filename)) {416 gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);417 die "\n";418 }419 print OUTFILE $start_text;420 421 # Get the contents of each individual HTML page generated by Xpdf, after first422 # modifying each, and write each out into our single all-encompassing html423 foreach my $pagefile (@page_files) {424 if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) {425 my $page_num = page_number($pagefile);426 # get full path to pagefile427 $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);428 # print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";429 my $modified_page_contents = $self->_process_paged_html_page($pagefile, $page_num, $num_html_pages);430 print OUTFILE "$modified_page_contents\n\n";431 }432 }433 434 # we've now created a single HTML file by concatenating (a modified version)435 # of each paged html file436 print OUTFILE "</body>\n</html>\n"; # write out closing tags437 close OUTFILE; # done438 439 # Get rid of all the htm(l) files incl index.html in the associated "pages"440 # subdir, since we've now processed them all into a single html file441 # one folder level up and we don't want HTMLPlugin to process all of them next.442 &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); # no specific whitelist, but blacklist htm(l)443 444 # now the tmp area should contain a single html file contain all the html pages'445 # contents in sequence, and a "pages" subdir containing the screenshot images446 # of each page.447 # HTMLPlugin will process these further in the plugin pipeline448 }449 450 # For whatever reason, most html <tags> don't get printed out in GLI451 # So when debugging, use this function to print them out as [tags] instead.452 sub _debug_print_html453 {454 my $self = shift (@_);455 my ($string_or_dom) = @_;456 457 # can't seem to determine type of string with ref/reftype458 # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable459 # Not needed, as $dom objects seem to get correctly stringified in string contexts460 # $dom.to_string/$dom.stringify seem to get called, no need to call them461 # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl462 my $escapedTxt = $string_or_dom;463 $escapedTxt =~ s@\<@[@sg;464 $escapedTxt =~ s@\>@]@sg;465 466 print STDERR "#### $escapedTxt\n";467 }468 469 # Helper function to read in each paged_html generated by Xpdf's pdftohtml470 # then modify the html suitably using the HTML parsing functions offered by471 # Mojo::DOM, then return the modified HTML content as a string472 # See https://mojolicious.org/perldoc/Mojo/DOM473 sub _process_paged_html_page474 {475 my $self = shift (@_);476 my ($pagefile, $page_num, $num_html_pages) = @_;477 478 my $text = "";479 480 # handling content encoding the same way default_convert_post_process does481 $self->read_file ($pagefile, "utf8", "", \$text);482 483 my $dom = Mojo::DOM->new($text);484 485 # $self->_debug_print_html($dom);486 487 # there's a <style> element on the <html>, we need to shift it into the <div>488 # tag that we'll be creating. We'll first slightly modify the <style> element489 # store the first style element, which is the only one and in the <body>490 # we'll later insert it as child of an all-encompassing div that we'll create491 my $page_style_tag_str = $dom->at('html')->at('style')->to_string;492 # In the style tag, convert id style references to class style references493 my $css_class = ".p".$page_num."f";494 $page_style_tag_str =~ s@\#f@$css_class@sg;495 my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified496 #$self->_debug_print_html($style_element);497 498 # need to know the image's height to set the height of the surrounding499 # div that's to replace this page's <body>:500 my $img_height = $dom->find('img')->[0]{height};501 502 # 2. Adjust the img#background src attribute to point to the pages subdir for imgs503 # 3. Set that img tag's class=background, and change its id to background+$page_num504 my $bg_img_tag=$dom->find('img#background')->[0];505 my $img_src_str = $bg_img_tag->{src};506 $img_src_str = "pages/$img_src_str";507 $bg_img_tag->attr(src => $img_src_str); # reset508 #$self->_debug_print_html($bg_img_tag);509 # set both class and modified id attributes in one step:510 $bg_img_tag->attr({class => "background", id => "background".$page_num});511 #$self->_debug_print_html($bg_img_tag);512 513 # get all the <span> nested inside <div class="txt"> elements and514 # 1. set their class attr to be "p + page_num + id-of-the-span",515 # 2. then delete the id, because the span ids have been reused when element516 # ids ought to be unique. Which is why we set the modified ids to be the517 # value of the class attribute instead518 $dom->find('div.txt span')->each(sub {519 $_->attr(class => "p". $page_num. $_->{id});520 delete $_->{id};521 }); # both changes done in one find() operation522 #$self->_debug_print_html($dom->find('div.txt span')->last);523 524 # Finally can create our new dom, starting with a div tag for the current page525 # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/>526 # my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" )527 my $new_dom = Mojo::DOM->new_tag('div', style => "position: relative; height: ".$img_height."px;" );528 #$self->_debug_print_html($new_dom);529 $new_dom->at('div')->append_content($style_element)->root;530 531 532 #$self->_debug_print_html($new_dom);533 # Copy across all the old html's body tag's child nodes into the new dom's new div tag534 $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string535 #$self->_debug_print_html($new_dom);536 537 538 # build up the outer div with the <h>tags for sectionalising539 my $inner_div_str = $new_dom->to_string;540 541 my $page_div = "<div id=\"page".$page_num."\">\n";542 # Append a page range bucket heading if applicable: if we have more than 10 pages543 # to display in the current bucket AND we're on the first page of each bucket of 10 pages.544 # Dr Bainbridge thinks for now we need only consider PDFs where the545 # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)546 # If number of remaining pages >= 10, then create new bucket heading547 # e.g. "Pages 30-40"548 if(($page_num % 10) == 1 && ($num_html_pages - $page_num) > 10) {549 # Double-digit page numbers that start with 2550 # i.e. 21 to 29 (and 30) should be in 21 to 30 range551 my $start_range = $page_num - ($page_num % 10) + 1;552 my $end_range = $page_num + 10 - ($page_num % 10);553 $page_div .= "<h2 style=\"font-size:1em;font-weight:normal;\">Pages ".$start_range . "-" . $end_range."</h2>\n";554 }555 556 # No sectionalising for 10 pages or under. Otherwise, every page is a section too, not just buckets557 if($num_html_pages > 10) {558 # Whether we're starting a new bucket or not, add a simpler heading: just the pagenumber, "Page #"559 $page_div .= "<h3 style=\"font-size:1em;font-weight:normal;\">Page ".$page_num."</h3>\n";560 }561 562 $page_div .= $inner_div_str;563 $page_div .= "\n</div>";564 565 # Finished processing a single html page of the paged_html output generated by566 # Xpdf's pdftohtml: finished massaging that single html page into the right form567 return $page_div;568 }569 570 # This subroutine is called to do the PDFPlugin post-processing for all cases571 # except the "paged_html" conversion mode. This is what PDFPlugin always used to do:572 sub default_convert_post_process573 {574 my $self = shift (@_);575 my ($conv_filename) = @_;576 293 my $outhandle=$self->{'outhandle'}; 577 294
Note:
See TracChangeset
for help on using the changeset viewer.