Changeset 32205 for main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
- Timestamp:
- 2018-06-21T21:41:12+12:00 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r31494 r32205 27 27 use strict; 28 28 no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR) 29 no strict 'subs'; # allow filehandles to be variables and viceversa 29 30 30 31 use ReadTextFile; 31 32 use unicode; 33 use Mojo::DOM; # for HTML parsing 32 34 33 35 use AutoLoadConverters; … … 44 46 { 'name' => "text", 45 47 'desc' => "{ConvertBinaryFile.convert_to.text}" }, 48 { 'name' => "paged_html", 49 'desc' => "{PDFPlugin.convert_to.paged_html}"}, 46 50 { 'name' => "pagedimg_jpg", 47 51 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"}, … … 145 149 146 150 # check convert_to 151 # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 147 152 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 148 153 print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; … … 281 286 my ($conv_filename) = @_; 282 287 288 my $outhandle=$self->{'outhandle'}; 289 # print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n"; 290 291 if($self->{'convert_to'} eq "paged_html") { 292 # special post-processing for paged_html mode, as HTML pages generated 293 # by xpdf's pdftohtml need to be massaged into the form we want 294 $self->xpdftohtml_convert_post_process($conv_filename); 295 } 296 else { # use PDFPlugin's usual post processing 297 $self->default_convert_post_process($conv_filename); 298 } 299 } 300 301 # Called after gsConvert.pl has been run to convert a PDF to paged_html 302 # using Xpdftools' pdftohtml 303 # This method will do some cleanup of the HTML files produced after XPDF has produced 304 # an HTML doc for each PDF page: it first gets rid of the default index.html. 305 # Instead, it constructs a single html page containing each original HTML page 306 # <body> nested as divs instead, with simple section information inserted at the top 307 # of each 'page' <div> and some further styling customisation. 
# This HTML manipulation is to be done with the Mojo::DOM perl package.
# Note that since xpdf's pdftohtml would have failed if the output dir already
# existed and for simpler naming, the output files are created in a new "pages"
# subdirectory of the tmp location parent of $conv_filename instead
#
# Parameters:
#   $output_filename - tmp location + filename of the single html file to create.
#                      The per-page html files are read from the "pages"
#                      subdirectory located next to it.
# Dies if the "pages" subdirectory cannot be opened, or if the output file
# cannot be opened for writing or closed cleanly.
sub xpdftohtml_convert_post_process
{
    my $self = shift (@_);
    my ($output_filename) = @_; # output_filename = tmp location + filename
    # if a single html were generated.
    # We just want the tmp location, append "pages", and read all the html files
    # in except for index.html. Then we create a new html file by name
    # $output_filename, which will consist of a slightly modified version of
    # each of the other html files concatenated together.

    my ($tailname, $tmp_dir)
        = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages");

    # Code from util::create_itemfile()
    # Read in all the plain files of the pages subdir (lexical dir handle, so
    # no package-global bareword handle is clobbered)
    opendir(my $pages_dh, $pages_subdir) || die "can't opendir $pages_subdir: $!";
    my @page_files = grep {-f "$pages_subdir/$_"} readdir($pages_dh);
    closedir($pages_dh);

    # Keep only the per-page htm(l) files (index.html and the page images are
    # skipped) and sort them by page number rather than lexically.
    # Files are named index.html, page1.html, page2.html, ..., pagen.html
    # Filtering once guarantees the page count and the pages written agree.
    my @html_pages = sort { page_number($a) <=> page_number($b) }
                     grep { m/\.html?$/ && !m/^index\.html?/i } @page_files;
    my $num_html_pages = scalar(@html_pages);

    # Prepare to create our new html page that will contain all the individual
    # htmls generated by xpdf's pdftohtml in sequence.
    # First write the opening html tags out to the output file. These are the
    # same tags and their contents, including <meta>, as is generated by
    # Xpdf's pdftohtml for each of its individual html pages.
    my $start_text = "<html>\n<head>\n";
    $start_text .= "<title>$tailname</title>\n";
    $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n";
    $start_text .= "</head>\n<body>\n\n";

    # Don't want to build a giant string in memory of all the pages concatenated
    # and then write it out in one go. Instead, build up the final single page
    # by writing each modified paged_html file out to it as this is processed.
    # Copying file open/close code from CommonUtil::utf8_write_file()
    my $out_fh;
    if (!open ($out_fh, ">:utf8", $output_filename)) {
        gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename);
        die "\n";
    }
    print $out_fh $start_text;

    # Get the contents of each individual HTML page generated by Xpdf, after first
    # modifying each, and write each out into our single all-encompassing html
    foreach my $pagefile (@html_pages) {
        my $page_num = page_number($pagefile);
        # full path kept in its own variable: assigning to the loop variable
        # would rewrite the list elements through aliasing
        my $pagefile_path = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
        my $modified_page_contents = $self->_process_paged_html_page($pagefile_path, $page_num, $num_html_pages);
        print $out_fh "$modified_page_contents\n\n";
    }

    # we've now created a single HTML file by concatenating (a modified version)
    # of each paged html file
    print $out_fh "</body>\n</html>\n"; # write out closing tags
    # buffered write errors only surface at close, so check it
    close($out_fh) || die "PDFPlugin::xpdftohtml_convert_post_process: could not close $output_filename: $!\n";

    # Get rid of all the htm(l) files incl index.html in the associated "pages"
    # subdir, since we've now processed them all into a single html file
    # one folder level up and we don't want HTMLPlugin to process all of them next.
    # Single quotes keep the backslash, so the dot stays a literal dot in the regex
    # (in a double-quoted string "\." collapses to "." and matches any character).
    &FileUtils::removeFilesFiltered($pages_subdir, '\.html?$');

    # now the tmp area should contain a single html file containing all the html
    # pages' contents in sequence, and a "pages" subdir containing the screenshot
    # images of each page.
    # HTMLPlugin will process these further in the plugin pipeline
}

# Returns the page number encoded in a file name of the form pageN.htm(l)
# (case-insensitive). Any other name, such as index.html or an image file,
# yields 0 and so sorts ahead of the numbered pages.
sub page_number
{
    my ($filename) = @_;
    my ($pagenum) = ($filename =~ m/^page(\d+)\.html?$/i);
    return defined $pagenum ? $pagenum : 0;
}

# Returns the text of the page range bucket heading (e.g. "Pages 21-30") that
# belongs in front of page $page_num, or nothing if no bucket heading is to be
# emitted there. Buckets hold 10 pages (1-10, 11-20, ...). A heading is only
# produced on the first page of a bucket, and only while more than 10 pages
# remain after that page.
sub _page_bucket_heading
{
    my ($page_num, $num_html_pages) = @_;

    return unless ($page_num % 10) == 1;               # not the first page of a bucket
    return unless ($num_html_pages - $page_num) > 10;  # too few pages remaining

    return "Pages ".$page_num."-".($page_num + 9);
}

# For whatever reason, most html <tags> don't get printed out in GLI
# So when debugging, use this function to print them out as [tags] instead.
# Accepts either a plain string or a Mojo::DOM object and prints the result
# to STDERR prefixed with "#### ".
sub _debug_print_html
{
    my $self = shift (@_);
    my ($string_or_dom) = @_;

    # Force stringification up front so a $dom object becomes its markup
    my $escapedTxt = "$string_or_dom";
    $escapedTxt =~ tr/<>/[]/;

    print STDERR "#### $escapedTxt\n";
}

# Helper function to read in each paged_html generated by Xpdf's pdftohtml
# then modify the html suitably using the HTML parsing functions offered by
# Mojo::DOM, then return the modified HTML content as a string
# See https://mojolicious.org/perldoc/Mojo/DOM
#
# Parameters:
#   $pagefile       - full path to one pageN.html file
#   $page_num       - the number N of that page
#   $num_html_pages - total number of page html files being combined
# Returns the markup of a single <div id="pageN"> holding the page's style
# element, headings and the child nodes of the page's <body>.
# A page lacking a <style>, <img> or <body> element is tolerated: the
# corresponding part is simply left out of the result.
sub _process_paged_html_page
{
    my $self = shift (@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    my $text = "";

    # handling content encoding the same way default_convert_post_process does
    $self->read_file ($pagefile, "utf8", "", \$text);

    my $dom = Mojo::DOM->new($text);

    # The first <style> element under <html> is moved into the div created for
    # this page. In it, id style references (#f...) become page-specific class
    # references (.p<page_num>f...), matching the span classes set further down.
    my $style_tag = $dom->at('html style');
    my $page_style_tag_str = defined $style_tag ? $style_tag->to_string : "";
    my $css_class = ".p".$page_num."f";
    $page_style_tag_str =~ s@\#f@$css_class@sg;

    # need to know the image's height to set the height of the surrounding
    # div that's to replace this page's <body>:
    my $first_img = $dom->at('img');
    my $img_height = defined $first_img ? $first_img->{height} : undef;

    # 1. Fix up the style attr on the image by additionally setting z-index:-1 for it
    # 2. Adjust the img#background src attribute to point to the pages subdir for imgs
    # 3. Set that img tag's class=background, and change its id to background+$page_num
    my $bg_img_tag = $dom->at('img#background');
    if (defined $bg_img_tag) {
        my $img_style_str = $bg_img_tag->{style};
        $img_style_str = "" unless defined $img_style_str;
        $img_style_str =~ s/\s+$//;
        # make sure the existing declarations are terminated before adding ours
        $img_style_str .= ";" if $img_style_str ne "" && $img_style_str !~ m/;$/;
        # CSS declarations are "property: value", so a colon is needed here
        # ("z-index=-1" is not valid CSS and gets ignored by browsers)
        $img_style_str .= " z-index:-1;";

        my %img_attrs = (style => $img_style_str,
                         class => "background",
                         id    => "background".$page_num);
        $img_attrs{src} = "pages/".$bg_img_tag->{src} if defined $bg_img_tag->{src};
        $bg_img_tag->attr(\%img_attrs);
    }

    # get all the <span> nested inside <div class="txt"> elements and
    # 1. set their class attr to be "p + page_num + id-of-the-span",
    # 2. then delete the id, because the span ids have been reused when element
    #    ids ought to be unique. Which is why we set the modified ids to be the
    #    value of the class attribute instead
    $dom->find('div.txt span')->each(sub {
        my ($span) = @_;
        my $span_id = $span->{id};
        return unless defined $span_id; # nothing to convert on id-less spans
        $span->attr(class => "p".$page_num.$span_id);
        delete $span->{id};
    });

    # Finally can create our new dom, starting with a div tag for the current page
    # Is: <div id="page$page_num" style="position: relative; height: ${img_height}px;"/>
    my $div_style = "position: relative;";
    $div_style .= " height: ".$img_height."px;" if defined $img_height;
    my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => $div_style);
    my $page_div = $new_dom->at('div');
    $page_div->append_content($page_style_tag_str) if $page_style_tag_str ne "";

    # Append a page range bucket heading if applicable
    # Dr Bainbridge thinks for now we need only consider PDFs where the
    # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...)
    # The heading goes on the first page of a bucket only, so that each bucket
    # is introduced once rather than on every page belonging to it.
    my $bucket_heading = _page_bucket_heading($page_num, $num_html_pages);
    if (defined $bucket_heading) {
        $page_div->append_content($new_dom->new_tag('h1', $bucket_heading));
    }

    # Add a simpler heading: just the pagenumber, "Page #"
    $page_div->append_content($new_dom->new_tag('h2', "Page ".$page_num));

    # Copy across all the old html's body tag's child nodes into the new dom's new div tag
    my $body_tag = $dom->at('body');
    if (defined $body_tag) {
        $body_tag->child_nodes->each(sub { $page_div->append_content($_[0]); });
    }

    # Finished processing a single html page of the paged_html output generated by
    # Xpdf's pdftohtml: finished massaging that single html page into the right form
    return $new_dom->to_string;
}

# This subroutine is called to do the PDFPlugin post-processing for all cases
# except the "paged_html" conversion mode. This is what PDFPlugin always used to do:
sub default_convert_post_process
{
    my $self = shift (@_);
    my ($conv_filename) = @_;
    my $outhandle=$self->{'outhandle'};
Note:
See TracChangeset
for help on using the changeset viewer.