###########################################################################
#
# PDFv2Plugin.pm -- pdf plugin that uses xpdftools and pdfbox to process PDFs.
# It only works out of the box for GS3 since it assumes the pdfbox extension
# is installed.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package PDFv2Plugin;

use strict;
no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
no strict 'subs'; # allow filehandles to be variables and viceversa

use ReadTextFile;
use unicode;
use Mojo::DOM; # for HTML parsing

use PDFBoxConverter;
use ConvertBinaryFile;

# Multiple inheritance: conversion dispatch comes from ConvertBinaryFile,
# the pdfbox conversion path from PDFBoxConverter, and text-file reading
# (used by the post-processing code) from ReadTextFile.
@PDFv2Plugin::ISA = ('ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile');

# The output formats this plugin can convert a PDF into.
# text, pretty_html and paged_pretty_html are produced with xpdftools;
# the remaining formats are produced with pdfbox.
my $convert_to_list = [
    { 'name' => "auto", # pretty_html using xpdftools' pdftohtml
      'desc' => "{ConvertBinaryFile.convert_to.auto}" },
    { 'name' => "text", # xpdftools' pdftotext
      'desc' => "{ConvertBinaryFile.convert_to.text}" },
    { 'name' => "paged_text", # pdfbox
      'desc' => "{ConvertBinaryFile.convert_to.paged_text}" },
    { 'name' => "html", # pdfbox  ## TODO: rename this to html_without_imgs?
      'desc' => "{PDFPlugin.convert_to.html}" },
    { 'name' => "pretty_html", # xpdftools
      'desc' => "{PDFPlugin.convert_to.pretty_html}" },
    { 'name' => "paged_pretty_html", # xpdftools
      'desc' => "{PDFPlugin.convert_to.paged_pretty_html}"},
    # pdfbox for all pagedimg(txt) output formats:
    { 'name' => "pagedimg_jpg",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
    { 'name' => "pagedimg_png",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}"},
    { 'name' => "pagedimgtxt_jpg",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimgtxt_jpg}"},
    { 'name' => "pagedimgtxt_png",
      'desc' => "{ConvertBinaryFile.convert_to.pagedimgtxt_png}"},
];

# Plugin option declarations, merged into the GLI/pluginfo argument list
# by the constructor below.
my $arguments = [
    { 'name' => "convert_to",
      'desc' => "{ConvertBinaryFile.convert_to}",
      'type' => "enum",
      'reqd' => "yes",
      'list' => $convert_to_list,
      'deft' => "paged_pretty_html" },
    { 'name' => "process_exp",
      'desc' => "{BaseImporter.process_exp}",
      'type' => "regexp",
      'deft' => &get_default_process_exp(),
      'reqd' => "no" },
    { 'name' => "block_exp",
      'desc' => "{CommonUtil.block_exp}",
      'type' => "regexp",
      'deft' => &get_default_block_exp() },
    # { 'name' => "metadata_fields",
    #   'desc' => "{HTMLPlugin.metadata_fields}",
    #   'type' => "string",
    #   'deft' => "Title,Author,Subject,Keywords" },
    # { 'name' => "metadata_field_separator",
    #   'desc' => "{HTMLPlugin.metadata_field_separator}",
    #   'type' => "string",
    #   'deft' => "" },
    { 'name' => "dpi",
      'desc' => "{PDFv2Plugin.dpi}",
      'deft' => "96",
      'type' => "int" },
    # 72DPI is xpdf's pdftohtml's default. pdfbox' default is 96DPI in headless mode else detected from the screen resolution, see https://pdfbox.apache.org/2.0/commandline.html#pdftoimage
    # { 'name' => "use_sections",
    #   'desc' => "{PDFPlugin.use_sections}",
    #   'type' => "flag" },
    # { 'name' => "description_tags",
    #   'desc' => "{HTMLPlugin.description_tags}",
    #   'type' => "flag" },
    { 'name' => "use_realistic_book",
      'desc' => "{PDFPlugin.use_realistic_book}",
      'type' => "flag" }
];

my $options = { 'name' => "PDFv2Plugin",
		'desc' => "{PDFv2Plugin.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'srcreplaceable' => "yes", # Source docs in PDF can be replaced with GS-generated html
		'args' => $arguments };

# Constructor.
# Merges a PDFBoxConverter instance and a ConvertBinaryFile instance into a
# single blessed object, resolves the requested convert_to output format,
# builds the gsConvert.pl options string, and configures the secondary plugin
# (HTMLPlugin or PagedImagePlugin) that will process the conversion products.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists);
    my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
    my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self); # this param order seems necessary to preserve the default/user-selected value for the convert_to option

    if ($self->{'info_only'}) {
	# don't worry about any options etc
	return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # convert_options are passed through to gsConvert.pl by ConvertBinaryFile.pm
    # the most important option is the tool that's used to do the conversion
    $self->{'convert_options'} = "-pdf_tool xpdftools"; # default for PDFv2Plugin. If pdfbox_conversion is on, the pdfbpox GS extension sets pdf_tool to pdfbox

    # Setting dpi has meaning for xpdftools pdftohtml (so paged_pretty_html and pretty_html)
    # and for when pdfbox outputs an image for each page (pagedimg, pagedimgtxt).
    # dpi has no effect on (paged_)text and html output modes.
    my $dpi = $self->{"dpi"};
    $self->{'convert_options'} .= " -pdf_dpi $dpi";

    # The old pdftohtml tool used by PDFPlugin didn't do PDF to txt conversion on Windows
    # But PDFv2Plugin now supports PDF to txt conversion on Windows too using XPDFTools' pdftotext
    if ($self->{'convert_to'} eq "auto") {
	# defaulting to paged_pretty_html, as it's the best default option when using xpdftools
	$self->{'convert_to'} = "paged_pretty_html";
	&gsprintf::gsprintf(STDERR, "PDFv2Plugin: {PDFv2Plugin.auto_output_default}\n", $self->{'convert_to'});
    }
    if ($self->{'use_realistic_book'}) {
	# realistic books can only be built from html output
	if ($self->{'convert_to'} ne "html") {
	    &gsprintf::gsprintf(STDERR, "PDFv2Plugin: {PDFPlugin.html_for_realistic_book}\n");
	    $self->{'convert_to'} = "html";
	}
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
	$secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
	push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
	# pdftohtml always produces utf8 - What about pdfbox???
	# push(@$specific_options, "-input_encoding", "utf8");
	push(@$specific_options, "-extract_language") if $self->{'extract_language'};
	push(@$specific_options, "-processing_tmp_files");
	# Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
	# to extract these metadata fields from the HEAD META fields
	if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
	    push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
	} else {
	    push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author");
	}
	if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
	    push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
	}
	if ($self->{'use_sections'} || $self->{'description_tags'}) {
	    $self->{'description_tags'} = 1;
	    push(@$specific_options, "-description_tags");
	}
	if ($self->{'use_realistic_book'}) {
	    push(@$specific_options, "-use_realistic_book");
	}
	if($self->{'convert_to'} eq "paged_pretty_html") {
	    # for paged pretty html, the default should be to sectionalise
	    # the single superpage, the one containing divs representing individual pages as sections, on headings
	    # NOTE(review): unlike every other option pushed onto @$specific_options,
	    # this one has no leading '-'; verify whether "-sectionalise_using_h_tags"
	    # was intended before relying on it.
	    push(@$specific_options, "sectionalise_using_h_tags");
	}
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
	push(@$specific_options, "-screenviewsize", "1000");
	push(@$specific_options, "-enable_cache");
	push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class;
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}

# Default regexp for which source files this plugin processes: *.pdf,
# case-insensitively.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.pdf$^;
}

# so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return "";
}

# Initialise both inheritance branches: the ConvertBinaryFile side (via SUPER)
# first, then the PDFBoxConverter side.
sub init {
    my $self = shift(@_);
    $self->SUPER::init(@_);
    $self->PDFBoxConverter::init(@_);
}

# begin/deinit call the PDFBoxConverter side first, mirroring each other.
sub begin {
    my $self = shift(@_);
    $self->PDFBoxConverter::begin(@_);
    $self->SUPER::begin(@_);
}

sub deinit {
    my $self = shift(@_);
    $self->PDFBoxConverter::deinit(@_);
    $self->SUPER::deinit(@_);
}

# Hash OIDs on the GA XML rather than on the converted output. Two PDFs that
# differ only in their metadata can convert to *identical* HTML when the
# conversion utility (certainly PDFBox) does not embed metadata in the
# generated HTML; hashing the GA XML keeps such documents distinct, in line
# with the original vision that the same document with different metadata
# should be seen as different.
sub get_oid_hash_type {
    my $self = shift(@_);
    return "hash_on_ga_xml";
}

# Dispatch the conversion of $input_filename to the requested output format:
# xpdftools (via ConvertBinaryFile) for text/pretty_html/paged_pretty_html,
# pdfbox (via PDFBoxConverter) for everything else.
# Returns the converted filename, or "" on a pdfbox conversion failure.
sub tmp_area_convert_file {
    my $self = shift(@_);
    my ($output_ext, $input_filename, $textref) = @_;

    my $mode = $self->{'convert_to'};
    if ($mode eq "text" || $mode =~ m/pretty_html$/) {
	# these modes use xpdftools
	return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
    }

    # All other output formats use pdfbox. This does directly what
    # AutoLoadConverters::tmp_area_convert_file(@_) does with PDFBoxConverter.
    my ($status, $status_msg, $converted_file) =
	$self->PDFBoxConverter::convert($input_filename, $output_ext);
    return $converted_file if (defined $status && $status != 0);

    # Conversion failed: log what we know and signal failure with "".
    my $log_handle = $self->{'outhandle'};
    print $log_handle "PDFBoxConverter had a conversion error\n";
    print $log_handle "$@\n";
    print $log_handle "$status_msg\n" if defined $status_msg;
    return "";
}

# Overriding to do some extra handling for pretty_html/paged_pretty_html
# output mode.
# For (paged_)pretty_html we use Xpdf tools' pdftohtml, telling it to create
# a subdir called "pages" in the tmp area for its products (Xpdf's pdftohtml
# must be passed a *non-existent* directory parameter, the "pages" subdir).
# On success the intermediary output file tmp/<pdfname>/pages/index.html
# should exist (besides other products there), so the return value reflects
# that intermediary product rather than whatever ConvertBinaryFile reported.
sub run_conversion_command {
    my $self = shift(@_);
    my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;

    # Anything that isn't (paged_)pretty_html follows the stock conversion path.
    if ($self->{'convert_to'} !~ m/pretty_html$/) {
	return $self->ConvertBinaryFile::run_conversion_command(@_);
    }

    # Let ConvertBinaryFile proceed normally for its side effects, but ignore
    # its return value: the product of the xpdftohtml conversion we care about
    # is tmp_dirname/pages/index.html.
    $self->ConvertBinaryFile::run_conversion_command(@_);
    my $xpdf_index_file = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");

    # Once convert_post_process() has run, the *final* product of the
    # (paged_)pretty_html conversion will be an html file with the same
    # basename as the input PDF, in the same tmp location. Record that
    # filename now so convert_post_process knows where to write.
    my ($name_prefix, $output_dir, $ext) =
	&File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
    $self->{'conv_filename_after_post_process'} =
	&FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
    # print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";

    return $xpdf_index_file;
}

# Post-conversion hook: (paged_)pretty_html output needs the HTML pages that
# xpdf's pdftohtml generated to be massaged into the form we want; every
# other mode gets the usual (PDFPlugin-style) post processing.
sub convert_post_process {
    my $self = shift(@_);
    my ($conv_filename) = @_;

    if ($self->{'convert_to'} =~ /pretty_html/) { # (paged_)pretty_html
	$self->xpdftohtml_convert_post_process($conv_filename);
    }
    else {
	$self->default_convert_post_process($conv_filename);
    }
}

# Called after gsConvert.pl has been run to convert a PDF to (paged_)pretty_html
# using Xpdftools' pdftohtml
# This method will do some cleanup of the HTML files produced after XPDF has produced
# an HTML doc for each PDF page: it first gets rid of the default index.html.
# Instead, it constructs a single html page containing each original HTML page
# nested as divs instead, with simple section information inserted at the top
# of each 'page'
# and some further styling customisation. This HTML manipulation
# is to be done with the Mojo::DOM perl package.
# Note that since xpdf's pdftohtml would have failed if the output dir already
# existed and for simpler naming, the output files are created in a new "pages"
# subdirectory of the tmp location parent of $conv_filename instead
sub xpdftohtml_convert_post_process {
    my $self = shift (@_);
    my ($pages_index_html) = @_; # = tmp/<pdfname>/pages/index.html for (paged_)pretty_html output mode
    my $output_filename = $self->{'conv_filename_after_post_process'};

    # Read in all the html files in tmp's "pages" subdir, except for index.html,
    # and use them to create a new html file called $self->{'conv_filename_after_post_process'}
    # which will consist of a slightly modified version of
    # each of the other html files concatenated together.

    my $outhandle=$self->{'outhandle'};

    my ($tailname, $pages_subdir, $suffix) = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");

    # Code from util::create_itemfile()
    # Read in all the files
    # NOTE(review): legacy idioms left untouched here: bareword dirhandle DIR,
    # and /^index\.html?/i is unanchored at the end (would also match e.g.
    # "index.htmlx") — confirm before tightening.
    opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!";
    my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR);
    closedir DIR;

    # Sort files in the directory by page_num
    # files are named index.html, page1.html, page2.html, ..., pagen.html
    sub page_number {
	my ($dir) = @_;
	my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i);
	$pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
	return $pagenum;
    }
    # sort the files in the directory in the order of page_num rather than lexically.
    @page_files = sort { page_number($a) <=> page_number($b) } @page_files;

    #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file.
    # For every html file there's an img file, so halve the total num.
    # What about other file types that may potentially be there too???
    # Instead, count the actual page html files (excluding index.html):
    my $num_html_pages = 0;
    foreach my $pagefile (@page_files) {
	$num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;
    }

    # Prepare to create our new html page that will contain all the individual
    # htmls generated by xpdf's pdftohtml in sequence.
    # First write the opening html tags out to the output file. These are the
    # same tags and their contents, including the title, as is generated by
    # Xpdf's pdftohtml for each of its individual html pages.
    # NOTE(review): the markup that belongs inside the string literals below
    # (everything that was in angle brackets) appears to have been stripped
    # when this file was extracted — restore the string contents from the
    # upstream Greenstone source before relying on this code.
    my $start_text = "\n\n";
    my ($output_tailname, $tmp_subdir, $html_suffix) = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
    $start_text .= "$output_tailname\n";
    $start_text .= "\n";
    $start_text .= "\n\n\n";

    if($self->{'convert_to'} =~ /paged_pretty_html/) { # then add the tags for sectionalising
	$start_text .= "

$output_tailname

\n\n";
    }

    # handle content encodings the same way that default_convert_post_process does
    # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write

    # Don't want to build a giant string in memory of all the pages concatenated
    # and then write it out in one go. Instead, build up the final single page
    # by writing each modified (paged_)pretty_html file out to it as this is processed.
    # Copying file open/close code from CommonUtil::utf8_write_file()
    if (!open (OUTFILE, ">:utf8", $output_filename)) {
	&gsprintf::gsprintf(STDERR, "PDFv2Plugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);
	die "\n";
    }
    print OUTFILE $start_text;

    # Get the contents of each individual HTML page generated by Xpdf, after first
    # modifying each, and write each out into our single all-encompassing html
    foreach my $pagefile (@page_files) {
	if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) {
	    my $page_num = page_number($pagefile);
	    # get full path to pagefile
	    $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
	    # print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
	    my $modified_page_contents = $self->_process_pretty_html_page($pagefile, $page_num, $num_html_pages);
	    print OUTFILE "$modified_page_contents\n\n";
	}
    }

    # we've now created a single HTML file by concatenating (a modified version)
    # of each paged html file
    print OUTFILE "\n\n"; # write out closing tags
    close OUTFILE; # done

    # Get rid of all the htm(l) files incl index.html in the associated "pages"
    # subdir, since we've now processed them all into a single html file
    # one folder level up and we don't want HTMLPlugin to process all of them next.
    # NOTE(review): in a double-quoted string "\." collapses to "." so the filter
    # pattern passed here is effectively ".html?$" — confirm that is what
    # FileUtils::removeFilesFiltered expects.
    &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); # no specific whitelist, but blacklist htm(l)

    # now the tmp area should contain a single html file containing all the html pages'
    # contents in sequence, and a "pages" subdir containing the screenshot images
    # of each page.
    # HTMLPlugin will process these further in the plugin pipeline
}

# For whatever reason, most html don't get printed out in GLI
# So when debugging, use this function to print them out as [tags] instead.
sub _debug_print_html {
    my $self = shift (@_);
    my ($string_or_dom) = @_;

    # can't seem to determine type of string with ref/reftype
    # https://stackoverflow.com/questions/1731333/how-do-i-tell-what-type-of-value-is-in-a-perl-variable
    # Not needed, as $dom objects seem to get correctly stringified in string contexts
    # $dom.to_string/$dom.stringify seem to get called, no need to call them
    # https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl
    my $escapedTxt = $string_or_dom;
    $escapedTxt =~ s@\<@[@sg;
    $escapedTxt =~ s@\>@]@sg;

    print STDERR "#### $escapedTxt\n";
}

# Helper function for (paged_)pretty_html
# to read in each page of pretty_html generated by Xpdf's pdftohtml
# then modify the html suitably using the HTML parsing functions offered by
# Mojo::DOM, then return the modified HTML content as a string.
# For paged_pretty_html, some additional modification is done to sectionalise the final html
# See https://mojolicious.org/perldoc/Mojo/DOM
sub _process_pretty_html_page {
    my $self = shift (@_);
    my ($pagefile, $page_num, $num_html_pages) = @_;

    my $text = "";
    # handling content encoding the same way default_convert_post_process does
    $self->read_file ($pagefile, "utf8", "", \$text);

    my $dom = Mojo::DOM->new($text);
    # $self->_debug_print_html($dom);
    # there's a