- Timestamp:
- 2018-06-21T21:41:12+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2
- Files:
-
- 211 added
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r30724 r32205 323 323 324 324 # Attempt conversion to HTML 325 if (!$output_type || ($output_type =~ m/html/i)) { 325 # Uses the old pdftohtml that doesn't work for newer PDF versions 326 #if ($output_type =~ m/^html/i) { 327 if (!$output_type || ($output_type =~ m/^html/i)) { 326 328 $success = &pdf_to_html($dirname, $input_filename, $output_filestem); 327 329 if ($success) { 328 330 return "html"; 331 } 332 } 333 334 # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This 335 # will be the new default for PDFs when output_type for PDF docs is not specified 336 # (once our use of xpdftools' pdftohtml has been implemented on win and mac). 337 if ($output_type =~ m/paged_html/i) { 338 #if (!$output_type || ($output_type =~ m/paged_html/i)) { 339 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 340 if ($success) { 341 return "paged_html"; 329 342 } 330 343 } … … 756 769 757 770 758 # Convert a pdf file to html with the pdftohtml command759 771 # Convert a pdf file to html with the old pdftohtml command 772 # which only works for older PDF versions 760 773 sub pdf_to_html { 761 774 my ($dirname, $input_filename, $output_filestem) = @_; … … 819 832 return 1; 820 833 } 834 835 836 # Convert a pdf file to html with the newer Xpdftools' pdftohtml 837 # This generates "paged HTML" where extracted, selectable text is positioned 838 # over screenshots of each page. 
839 # Since xpdf's pdftohtml fails if the output dir already exists and for easier 840 # naming, the output files are created in a "pages" subdirectory of the tmp 841 # location parent of $output_filestem instead 842 sub xpdf_to_html { 843 my ($dirname, $input_filename, $output_filestem) = @_; 844 845 my $cmd = ""; 846 847 # build up the path to the doc-to-html conversion tool we're going to use 848 my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools"); 849 850 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { 851 # TODO 852 } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) { 853 # TODO 854 } else { # unix, use the appropriate bin folder for the bitness of the system 855 856 # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since 857 # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit 858 # specific subdirectories exist in a greenstone installation. 859 # None of those locations need exist when xpdf-tools is installed with GS. 860 # So don't depend on GSDLARCH as forcing that to be exported has side-effects 861 if($ENV{'BITNESS'}) { 862 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'}); 863 } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64? 864 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32"); 865 } 866 } 867 868 # We'll create the file by name $output_filestem during post-conversion processing. 869 # Note that Xpdf tools will only create its conversion products in a dir that does 870 # not yet exist. So we'll create this location as a subdir of the output_filestem's 871 # parent directory. The parent dir is the already generated tmp area for conversion. So: 872 # - tmpdir gs2build/tmp/<random-num> already exists at this stage 873 # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing 874 # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there. 
875 my ($tailname, $tmp_dirname, $suffix) 876 = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$"); 877 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages"); 878 879 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml"); 880 # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100% 881 $cmd .= "\"$xpdf_pdftohtml\""; 882 $cmd .= " -z $pdf_zoom" if ($pdf_zoom); 883 # $cmd .= " -c" if ($pdf_complex); 884 # $cmd .= " -i" if ($pdf_ignore_images); 885 # $cmd .= " -a" if ($pdf_allow_images_only); 886 # $cmd .= " -hidden" unless ($pdf_nohidden); 887 $cmd .= " \"$input_filename\" \"$tmp_dirname\""; 888 #$cmd .= " \"$input_filename\" \"$output_filestem\""; 889 890 if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) { 891 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 892 } else { 893 $cmd .= " > \"$output_filestem.err\""; 894 } 895 896 #print STDERR "@@@@ Running command: $cmd\n"; 897 898 $!=0; 899 my $retval=system($cmd); 900 if ($retval!=0) 901 { 902 print STDERR "Error executing xpdf's pdftohtml tool"; 903 if ($!) {print STDERR ": $!";} 904 print STDERR "\n"; 905 } 906 907 # make sure the converter made something 908 if ($retval!=0 || ! 
-s &FileUtils::filenameConcatenate($tmp_dirname,"index.html")) 909 { 910 &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out"); 911 # print out the converter's std err, if any 912 if (-s "$output_filestem.err") { 913 open (ERRLOG, "$output_filestem.err") || die "$!"; 914 print STDERR "pdftohtml error log:\n"; 915 while (<ERRLOG>) { 916 print STDERR "$_"; 917 } 918 close ERRLOG; 919 } 920 #print STDERR "***********output filestem $output_filestem.html\n"; 921 &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname"); 922 if (-e "$output_filestem.err") { 923 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 924 { 925 open (ERRLOG, "$output_filestem.err"); 926 while (<ERRLOG>) {print FAILLOG $_;} 927 close ERRLOG; 928 close FAILLOG; 929 } 930 &FileUtils::removeFiles("$output_filestem.err"); 931 } 932 return 0; 933 } 934 935 &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err"); 936 &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out"); 937 return 1; 938 } 939 940 821 941 822 942 # Convert a pdf file to various types of image with the convert command -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r31766 r32205 161 161 } 162 162 163 if ($convert_to =~ /^html/ ) { # may be html or html_multi163 if ($convert_to =~ /^html/ || $convert_to eq "paged_html") { # may be html or html_multi, or paged_html with the new Xpdf's own pdftohtml 164 164 $self->{'convert_to_plugin'} = "HTMLPlugin"; 165 165 $self->{'convert_to_ext'} = "html"; … … 349 349 $output_filename = $tmp_dirname . "\/$utf8_tailname\/" . $utf8_tailname . ".$output_type"; 350 350 } 351 } elsif ($output_type eq "paged_html") { 352 $output_filename =~ s/$lc_suffix$/.html/; 351 353 } else { 352 354 $output_filename =~ s/$lc_suffix$/.$output_type/; … … 371 373 372 374 if ("$conv_filename" eq "") {return -1;} # had an error, will be passed down pipeline 373 if (! -e "$conv_filename") {return -1;} 375 376 # We used to return -1 here if $conv_filename didn't exist at this stage 377 # However, for "paged_html" convert_to mode, the converted HTML file $conv_filename 378 # will only be created from conversion products *after* convert_post_process() returns 379 my $output_type=$self->{'convert_to'}; 380 if ($output_type ne "paged_html" && ! -e "$conv_filename") {return -1;} 374 381 $self->{'conv_filename'} = $conv_filename; 375 382 $self->convert_post_process($conv_filename); 383 if ($output_type eq "paged_html" && ! -e "$conv_filename") {return -1;} 376 384 377 385 # Run the "fribidi" (http://fribidi.org) Unicode Bidirectional Algorithm program over the converted file -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r31494 r32205 27 27 use strict; 28 28 no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR) 29 no strict 'subs'; # allow filehandles to be variables and viceversa 29 30 30 31 use ReadTextFile; 31 32 use unicode; 33 use Mojo::DOM; # for HTML parsing 32 34 33 35 use AutoLoadConverters; … … 44 46 { 'name' => "text", 45 47 'desc' => "{ConvertBinaryFile.convert_to.text}" }, 48 { 'name' => "paged_html", 49 'desc' => "{PDFPlugin.convert_to.paged_html}"}, 46 50 { 'name' => "pagedimg_jpg", 47 51 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"}, … … 145 149 146 150 # check convert_to 151 # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 147 152 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 148 153 print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; … … 281 286 my ($conv_filename) = @_; 282 287 288 my $outhandle=$self->{'outhandle'}; 289 # print STDERR "@@@ convert_to: ".$self->{'convert_to'}."\n"; 290 291 if($self->{'convert_to'} eq "paged_html") { 292 # special post-processing for paged_html mode, as HTML pages generated 293 # by xpdf's pdftohtml need to be massaged into the form we want 294 $self->xpdftohtml_convert_post_process($conv_filename); 295 } 296 else { # use PDFPlugin's usual post processing 297 $self->default_convert_post_process($conv_filename); 298 } 299 } 300 301 # Called after gsConvert.pl has been run to convert a PDF to paged_html 302 # using Xpdftools' pdftohtml 303 # This method will do some cleanup of the HTML files produced after XPDF has produced 304 # an HTML doc for each PDF page: it first gets rid of the default index.html. 305 # Instead, it constructs a single html page containing each original HTML page 306 # <body> nested as divs instead, with simple section information inserted at the top 307 # of each 'page' <div> and some further styling customisation. 
This HTML manipulation 308 # is to be done with the Mojo::DOM perl package. 309 # Note that since xpdf's pdftohtml would have failed if the output dir already 310 # existed and for simpler naming, the output files are created in a new "pages" 311 # subdirectory of the tmp location parent of $conv_filename instead 312 sub xpdftohtml_convert_post_process 313 { 314 my $self = shift (@_); 315 my ($output_filename) = @_; # output_filename = tmp location + filename 316 # if a single html were generated. 317 # We just want the tmp location, append "pages", and read all the html files 318 # in except for index.html. Then we create a new html file by name 319 # $output_filename, which will consist of a slightly modified version of 320 # each of the other html files concatenated together. 321 322 my $outhandle=$self->{'outhandle'}; 323 324 my ($tailname, $tmp_dir, $suffix) 325 = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$"); 326 my $pages_subdir = &FileUtils::filenameConcatenate($tmp_dir, "pages"); 327 328 # Code from util::create_itemfile() 329 # Read in all the files 330 opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!"; 331 my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR); 332 closedir DIR; 333 # Sort files in the directory by page_num 334 # files are named index.html, page1.html, page2.html, ..., pagen.html 335 sub page_number { 336 my ($dir) = @_; 337 my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i); 338 $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0 339 return $pagenum; 340 } 341 # sort the files in the directory in the order of page_num rather than lexically. 342 @page_files = sort { page_number($a) <=> page_number($b) } @page_files; 343 344 #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file. 345 # For every html file there's an img file, so halve the total num. 346 # What about other file types that may potentially be there too??? 
347 my $num_html_pages = 0; 348 foreach my $pagefile (@page_files) { 349 $num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i; 350 } 351 352 # Prepare to create our new html page that will contain all the individual 353 # htmls generated by xpdf's pdftohtml in sequence. 354 # First write the opening html tags out to the output file. These are the 355 # same tags and their contents, including <meta>, as is generated by 356 # Xpdf's pdftohtml for each of its individual html pages. 357 my $start_text = "<html>\n<head>\n"; 358 $start_text .= "<title>$tailname</title>\n"; 359 $start_text .= "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n"; 360 $start_text .= "</head>\n<body>\n\n"; 361 362 #handle content encodings the same way that default_convert_post_process does 363 # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write 364 # Don't want to build a giant string in memory of all the pages concatenated 365 # and then write it out in one go. Instead, build up the final single page 366 # by writing each modified paged_html file out to it as this is processed. 
367 # Copying file open/close code from CommonUtil::utf8_write_file() 368 if (!open (OUTFILE, ">:utf8", $output_filename)) { 369 gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename); 370 die "\n"; 371 } 372 print OUTFILE $start_text; 373 374 # Get the contents of each individual HTML page generated by Xpdf, after first 375 # modifying each, and write each out into our single all-encompassing html 376 foreach my $pagefile (@page_files) { 377 if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) { 378 my $page_num = page_number($pagefile); 379 # get full path to pagefile 380 $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile); 381 # print STDERR "@@@ About to process html file $pagefile (num $page_num)\n"; 382 my $modified_page_contents = $self->_process_paged_html_page($pagefile, $page_num, $num_html_pages); 383 print OUTFILE "$modified_page_contents\n\n"; 384 } 385 } 386 387 # we've now created a single HTML file by concatenating (a modified version) 388 # of each paged html file 389 print OUTFILE "</body>\n</html>\n"; # write out closing tags 390 close OUTFILE; # done 391 392 # Get rid of all the htm(l) files incl index.html in the associated "pages" 393 # subdir, since we've now processed them all into a single html file 394 # one folder level up and we don't want HTMLPlugin to process all of them next. 395 # my @fullpath_page_files = map { &FileUtils::filenameConcatenate($pages_subdir, $_) } @page_files; 396 &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); # no specific whitelist, but blacklist htm(l) 397 398 # now the tmp area should contain a single html file containing all the html pages' 399 # contents in sequence, and a "pages" subdir containing the screenshot images 400 # of each page. 
401 # HTMLPlugin will process these further in the plugin pipeline 402 } 403 404 # For whatever reason, most html <tags> don't get printed out in GLI 405 # So when debugging, use this function to print them out as [tags] instead. 406 sub _debug_print_html 407 { 408 my $self = shift (@_); 409 my ($string_or_dom) = @_; 410 411 # can't seem to determine type of string with ref/reftype 412 # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable 413 414 # $dom objects appear to get correctly stringified in string contexts 415 # $dom.to_string/$dom.stringify seem to get called, no need to call them 416 # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl 417 my $escapedTxt = $string_or_dom; 418 $escapedTxt =~ s@\<@[@sg; 419 $escapedTxt =~ s@\>@]@sg; 420 421 print STDERR "#### $escapedTxt\n"; 422 } 423 424 # Helper function to read in each paged_html generated by Xpdf's pdftohtml 425 # then modify the html suitably using the HTML parsing functions offered by 426 # Mojo::DOM, then return the modified HTML content as a string 427 # See https://mojolicious.org/perldoc/Mojo/DOM 428 sub _process_paged_html_page 429 { 430 my $self = shift (@_); 431 my ($pagefile, $page_num, $num_html_pages) = @_; 432 433 my $text = ""; 434 435 # handling content encoding the same way default_convert_post_process does 436 $self->read_file ($pagefile, "utf8", "", \$text); 437 438 my $dom = Mojo::DOM->new($text); 439 440 # $self->_debug_print_html($dom); 441 442 # there's a <style> element on the <html>, we need to shift it into the <div> 443 # tag that we'll be creating. 
We'll first slightly modify the <style> element 444 # store the first style element, which is the only one and in the <body> 445 # we'll later insert it as child of an all-encompassing div that we'll create 446 # my $page_style_tag_str = $dom->find('style')->[0]->to_string; 447 # my $page_style_tag_str = $dom->find('html style')->[0]->to_string; 448 my $page_style_tag_str = $dom->at('html')->at('style')->to_string; 449 # In the style tag, convert id style references to class style references 450 my $css_class = ".p".$page_num."f"; 451 $page_style_tag_str =~ s@\#f@$css_class@sg; 452 my $style_element = Mojo::DOM->new($page_style_tag_str)->at('style'); # modified 453 #$self->_debug_print_html($style_element); 454 455 # need to know the image's height to set the height of the surrounding 456 # div that's to replace this page's <body>: 457 my $img_height = $dom->find('img')->[0]{height}; 458 459 460 # 1. Fix up the style attr on the image by additionally setting z-index=-1 for it 461 # 2. Adjust the img#background src attribute to point to the pages subdir for imgs 462 # 3. Set that img tag's class=background, and change its id to background+$page_num 463 my $bg_img_tag=$dom->find('img#background')->[0]; 464 465 my $img_style_str = $bg_img_tag->{style}; # = $dom->find('img#background')->[0]{style} 466 $img_style_str = $img_style_str." z-index=-1;"; 467 #print STDERR "img_style_str: " . $img_style_str."\n"; 468 my $img_src_str = $bg_img_tag->{src}; 469 $img_src_str = "pages/$img_src_str"; 470 $bg_img_tag->attr({style => $img_style_str, src => $img_src_str}); # reset 471 #$self->_debug_print_html($bg_img_tag); 472 # set both class and modified id attributes in one step: 473 $bg_img_tag->attr({class => "background", id => "background".$page_num}); 474 #$self->_debug_print_html($bg_img_tag); 475 476 # get all the <span> nested inside <div class="txt"> elements and 477 # 1. set their class attr to be "p + page_num + id-of-the-span", 478 # 2. 
then delete the id, because the span ids have been reused when element 479 # ids ought to be unique. Which is why we set the modified ids to be the 480 # value of the class attribute instead 481 $dom->find('div.txt span')->each(sub { 482 $_->attr(class => "p". $page_num. $_->{id}); 483 delete $_->{id}; 484 }); # both changes done in one find() operation 485 #$self->_debug_print_html($dom->find('div.txt span')->last); 486 487 # Finally can create our new dom, starting with a div tag for the current page 488 # Must be: <div id="$page_num" style="position:relative; height:$img_height;"/> 489 my $new_dom = Mojo::DOM->new_tag('div', id => "page".$page_num, style => "position: relative; height: ".$img_height."px;" ); 490 #$self->_debug_print_html($new_dom); 491 $new_dom->at('div')->append_content($style_element)->root; 492 493 # Append a page range bucket heading if applicable 494 # Dr Bainbridge thinks for now we need only consider PDFs where the 495 # total number of pages < 1000 and create buckets of size 10 (e.g. 1-10, ... 51-60, ...) 496 # If number of remaining pages >= 10, then create new bucket heading 497 # e.g. "Pages 30-40" 498 if(($num_html_pages - $page_num) > 10) { 499 # Double-digit page numbers that start with 2 500 # i.e. 21 to 29 (and 30) should be in 21 to 30 range 501 my $start_range = $page_num - ($page_num % 10) + 1; 502 my $end_range = $page_num + 10 - ($page_num % 10); 503 if($page_num % 10 == 0) { # page 20 however, should be in 11 to 20 range 504 $start_range -= 10; 505 $end_range -= 10; 506 } 507 $new_dom->at('div')->append_content($new_dom->new_tag('h1', "Pages ".$start_range . "-" . 
$end_range))->root; 508 } 509 510 # Add a simpler heading: just the pagenumber, "Page #" 511 $new_dom->at('div')->append_content($new_dom->new_tag('h2', "Page ".$page_num))->root; 512 #$self->_debug_print_html($new_dom); 513 # Copy across all the old html's body tag's child nodes into the new dom's new div tag 514 $dom->at('body')->child_nodes->each(sub { $new_dom->at('div')->append_content($_)}); #$_->to_string 515 #$self->_debug_print_html($new_dom); 516 517 # Finished processing a single html page of the paged_html output generated by 518 # Xpdf's pdftohtml: finished massaging that single html page into the right form 519 return $new_dom->to_string; 520 } 521 522 # This subroutine is called to do the PDFPlugin post-processing for all cases 523 # except the "paged_html" conversion mode. This is what PDFPlugin always used to do: 524 sub default_convert_post_process 525 { 526 my $self = shift (@_); 527 my ($conv_filename) = @_; 283 528 my $outhandle=$self->{'outhandle'}; 284 529 -
main/trunk/greenstone2/perllib/strings.properties
r32112 r32205 1163 1163 PDFPlugin.complex:Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path). 1164 1164 1165 PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. Generated with Xpdf tools. 1166 1165 1167 PDFPlugin.desc:Plugin that processes PDF documents. 1166 1168 … … 1173 1175 PDFPlugin.use_sections:Create a separate section for each page of the PDF file. 1174 1176 1175 PDFPlugin.zoom:The factor by which to zoom the PDF for output (this is only useful if -complex is set).1177 PDFPlugin.zoom:The factor by which to zoom the PDF for output. When not outputting as paged_html, then zoom is only useful if -complex is set. If output is as paged_html, then a zoom factor of 1 means 100 percent. 1176 1178 1177 1179 PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html -
main/trunk/greenstone2/setup.bash
r32013 r32205 193 193 ;; 194 194 esac 195 196 # for xpdf tools, need to know whether we're using the bin32 or bin64 folder 197 BITNESS=$GSDLARCH 198 export BITNESS 195 199 196 200 # Only want non-trivial GSDLARCH value set if there is evidence of
Note:
See TracChangeset
for help on using the changeset viewer.