Index: /gs2-extensions/malware-checker/trunk/perllib/plugins/MalwareCheckerConverter.pm
===================================================================
--- /gs2-extensions/malware-checker/trunk/perllib/plugins/MalwareCheckerConverter.pm (revision 33676)
+++ /gs2-extensions/malware-checker/trunk/perllib/plugins/MalwareCheckerConverter.pm (revision 33676)
@@ -0,0 +1,356 @@
+###########################################################################
+#
+# MalwareCheckerConverter - allows files to be scanned for malware through VirusTotal API
+#
+# A component of the Greenstone digital library software
+# from the New Zealand Digital Library Project at the
+# University of Waikato, New Zealand.
+#
+# Copyright (C) 2010 New Zealand Digital Library Project
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+###########################################################################
+package MalwareCheckerConverter;
+
+use BaseMediaConverter;
+
+use strict;
+no strict 'refs'; # allow filehandles to be variables and viceversa
+no strict 'subs'; # allow barewords (eg STDERR) as function arguments
+
+#use HTML::Entities; # for encoding characters into their HTML entities when PDFBox converts to text
+
+use gsprintf 'gsprintf';
+use FileUtils;
+
+# Package-level availability flag and failure-reason key, shared between the
+# BEGIN block (which sets them) and new() (which copies them into $self).
+# these two variables mustn't be initialised here or they will get stuck
+# at those values.
+our $malwarechecker_conversion_available;
+our $no_malwarechecker_conversion_reason;
+
+# Compile-time availability probe: checks that the malware-checker extension is
+# installed (GEXT_MALWARECHECKER env var), that its jar exists, and that java
+# can be launched. On any failure it clears the availability flag and records a
+# resource-bundle key explaining why, rather than dying.
+BEGIN {
+ @MalwareCheckerConverter::ISA = ('BaseMediaConverter');
+
+ # Check that MalwareChecker is installed and available on the path
+ $malwarechecker_conversion_available = 1;
+ $no_malwarechecker_conversion_reason = "";
+
+ if (!defined $ENV{'GEXT_MALWARECHECKER'}) {
+ $malwarechecker_conversion_available = 0;
+ # NOTE(review): reason key "gextpdfboxnotinstalled" looks copied from the
+ # PDFBox extension -- confirm the properties file has (or should have) a
+ # malware-checker-specific key here.
+ $no_malwarechecker_conversion_reason = "gextpdfboxnotinstalled";
+ }
+ else {
+ my $gextmw_home = $ENV{'GEXT_MALWARECHECKER'};
+ my $pbajar = &FileUtils::filenameConcatenate($gextmw_home,"lib","java","malware-checker-1.0-SNAPSHOT-jar-with-dependencies.jar");
+
+ if (!-e $pbajar) {
+ &gsprintf(STDERR,"**** Failed to find $pbajar\n");
+ $malwarechecker_conversion_available = 0;
+ # NOTE(review): "malwhere" spelling below -- confirm this key matches the
+ # resource/properties file, otherwise the user-facing reason won't resolve.
+ $no_malwarechecker_conversion_reason = "gextmalwherecheckerjarnotinstalled";
+ }
+ else {
+ # test to see if java is in path
+ # Need to run java -version instead of just java, since the %ERRORLEVEL% returned
+ # for `java` (which is checked below for failure of the command) is 0 for JDK 1.6*
+ # while %ERRORLEVEL% is 1 for JDK 1.7*
+ # If `java -version` is run however, %ERRORLEVEL% returned is 0 if java is
+ # installed, regardless of whether the JDK version is 1.6* or 1.7*.
+ my $java = &util::get_java_command();
+
+ my $cmd = "$java -version";
+ if ($ENV{'GSDLOS'} =~ /^windows/i) {
+ $cmd .= " >nul 2>&1"; # java 2>&1 >null or java >null 2>&1 both work (%ERRORLEVEL% is 0)
+ }
+ else {
+ # On Ubuntu, java >/dev/null 2>&1 works,
+ # but java 2>&1 >/dev/null doesn't work: output goes to screen anyway
+ $cmd .= " >/dev/null 2>&1"; # " >/dev/null 2>&1 &" - don't need & at end for Linux Centos anymore (Ubuntu was already fine without it)
+ }
+
+ # system() returns the raw wait status; non-zero means java failed to run.
+ my $status = system($cmd);
+
+ if ($status != 0) {
+
+ my $error_message = "**** Testing for java\n";
+ $error_message .= "Failed to run: $cmd\n";
+ $error_message .= "Error variable: |$!| and status: $status\n";
+
+ &gsprintf(STDERR, "MalwareCheckerConverter: $error_message");
+
+ $malwarechecker_conversion_available = 0;
+ $no_malwarechecker_conversion_reason = "couldnotrunjava";
+ }
+ }
+ }
+
+}
+
+# Plugin-framework metadata: this converter declares no arguments of its own and
+# is abstract (it is mixed in by concrete plugins such as PDFv3Plugin).
+my $arguments = [ ];
+
+my $options = { 'name' => "MalwareCheckerConverter",
+ 'desc' => "{MalwareCheckerConverter.desc}",
+ 'abstract' => "yes",
+ 'inherits' => "yes",
+ 'args' => $arguments };
+
+# Constructor. Registers this converter with the plugin framework, then -- if
+# the BEGIN-time availability probe passed -- precomputes the java launch
+# command strings used later by convert(). Interface matches BaseMediaConverter.
+sub new {
+ my ($class) = shift (@_);
+ my ($pluginlist,$inputargs,$hashArgOptLists,$auxilary) = @_;
+ push(@$pluginlist, $class);
+
+ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+ push(@{$hashArgOptLists->{"OptList"}},$options);
+
+
+ my $self = new BaseMediaConverter($pluginlist, $inputargs,
+ $hashArgOptLists, $auxilary);
+
+ if ($self->{'info_only'}) {
+ # don't worry about any options etc
+ return bless $self, $class;
+ }
+ if ($malwarechecker_conversion_available) {
+ my $gextmw_home = $ENV{'GEXT_MALWARECHECKER'};
+ # NOTE(review): these jars are PDFBox ones, not the malware-checker jar the
+ # BEGIN block probes for -- looks like copy-paste from PDFBoxConverter; confirm.
+ my $pbajar = &FileUtils::filenameConcatenate($gextmw_home,"lib","java","pdfbox-app.jar");
+ my $pbjbigjar = &FileUtils::filenameConcatenate($gextmw_home,"lib","java","jbig2-imageio-3.0.1.jar");
+ # Not including the following JPEG2000 jar, as it is under commercial license:
+ # https://github.com/jai-imageio/jai-imageio-jpeg2000 leading to https://bintray.com/jai-imageio/maven/jai-imageio-jpeg2000# (Files tab)
+ # my $pbjp2jar = &FileUtils::filenameConcatenate($gextmw_home,"lib","java","jai-imageio-jpeg2000-1.3.0.jar"); # jpeg2000
+ my $java = &util::get_java_command();
+ $self->{'malwarechecker_txt_launch_cmd'} = "$java -cp \"$pbajar\" org.apache.pdfbox.tools.ExtractText";
+ # The embedded literal newline below sets java's line.separator property.
+ $self->{'malwarechecker_html_launch_cmd'} = "$java -cp \"$pbajar\" -Dline.separator=\"
\" org.apache.pdfbox.tools.ExtractText";
+
+ # We use this next cmd to launch our new custom PDFBox class (PDFBoxToImagesAndText.java) to convert each PDF page into an image (gif, jpg, png)
+ # AND its extracted text. Or just each page's extracted text. An item file is still generated,
+ # but this time referring to txtfiles too, not just the images. Result: searchable paged output.
+ # Our new custom class PDFBoxToImagesAndText.java lives in the new build folder, so add that to the classpath for the launch cmd
+ my $malwarechecker_build = &FileUtils::filenameConcatenate($gextmw_home,"build");
+ # put the pdfbox jar, the jbig2-imageio library (Apache Software License 2.0)
+ # and our build folder containing our custom PDFBox class on the classpath
+ my $classpath = &util::pathname_cat($pbajar, $pbjbigjar, $malwarechecker_build);
+# $self->{'malwarechecker_img_launch_cmd'} = "java -cp \"$classpath\" org.apache.pdfbox.tools.PDFToImage"; # pdfbox 2.09 cmd for converting each PDF page to an image (jpg, png)
+ # Use the resolved $java (not a bare "java") so the same JVM found by
+ # util::get_java_command() is used here too.
+ $self->{'malwarechecker_imgtxt_launch_cmd'} = "$java -cp \"$classpath\" org.greenstone.pdfbox.PDFBoxToImagesAndText";
+ }
+ else {
+ $self->{'no_malwarechecker_conversion_reason'} = $no_malwarechecker_conversion_reason;
+
+ my $outhandle = $self->{'outhandle'};
+ # NOTE(review): resource keys still reference PDFBoxConverter.* -- confirm
+ # whether MalwareCheckerConverter.* keys exist before renaming them.
+ &gsprintf($outhandle, "MalwareCheckerConverter: {PDFBoxConverter.noconversionavailable} ({PDFBoxConverter.$no_malwarechecker_conversion_reason})\n");
+ }
+
+ print STDERR "**** malware check avail = $malwarechecker_conversion_available\n";
+
+ $self->{'malwarechecker_conversion_available'} = $malwarechecker_conversion_available;
+
+ return bless $self, $class;
+
+}
+
+# Per-build initialisation: resets the list of temporary conversion products
+# (cleaned up later by clean_up_temporary_files) and delegates to
+# CommonUtil::init so that verbosity etc. are set in $self.
+sub init {
+ print STDERR "******* MalwareCheckerConverter init() !!!!!!\n";
+ my $self = shift(@_);
+ my ($verbosity, $outhandle, $failhandle) = @_;
+
+ # Use an array ref: assigning () to a scalar hash slot would leave it undef.
+ $self->{'pbtmp_file_paths'} = [];
+
+ # *****
+ $self->CommonUtil::init(@_); # need verbosity set in $self
+ # $self->{'verbosity'} = $verbosity;
+}
+
+# Teardown hook: removes any temporary files/dirs recorded during convert().
+sub deinit {
+ my $self = shift(@_);
+
+ $self->clean_up_temporary_files();
+}
+
+
+# Convert a single source file to $target_file_type ("html", "jpg"/"png" for
+# paged-image modes, anything else => plain text) by shelling out to the java
+# launch commands prepared in new(). Returns (success_flag, result_string,
+# target_file_path); returns 0 when conversion is unavailable or the source
+# file is missing.
+sub convert {
+ my $self = shift(@_);
+ my ($source_file_full_path, $target_file_type) = @_;
+
+ return 0 unless $malwarechecker_conversion_available;
+ # check the filename
+ return 0 if ( !-f $source_file_full_path);
+
+ # Although PDFBoxConverter inherits from AutoLoadConverters and therefore
+ # doesn't go through gsConvert.pl, still set the -pdf_tool flag in convert_options
+ # in case in future PDFBoxConverter no longer inherits from AutoLoadConverters
+ # and ends up going through gsConvert.pl
+ $self->{'convert_options'} .= " -pdf_tool pdfbox";
+
+ my $img_output_mode = 0;
+
+ my $convert_to = $self->{'convert_to'};
+ my $paged_txt_output_mode = ($convert_to =~ /(pagedimgtxt|paged_text)/) ? 1 : 0;
+
+ # the following line is necessary to avoid 'uninitialised variable' error
+ # messages concerning the converted_to member variable when PDFPlugin's
+ # use_sections option is checked.
+ # PDFBox plugin now processes use_sections option, when working with v1.5.0
+ # of the PDFBox jar file (which embeds each page in special tags).
+ if ($target_file_type eq "html") {
+ $self->{'converted_to'} = "HTML";
+ } elsif ($target_file_type eq "jpg" || $target_file_type eq "png") { # || $target_file_type eq "gif"
+ # GIF not supported by PDFBox at present, see https://pdfbox.apache.org/1.8/commandline.html#pdftoimage
+ $self->{'converted_to'} = $target_file_type;
+ $img_output_mode = 1;
+ } else {
+ $self->{'converted_to'} = "text";
+ }
+
+ my $outhandle = $self->{'outhandle'};
+ my $verbosity = $self->{'verbosity'};
+
+ my $source_file_no_path = &File::Basename::basename($source_file_full_path);
+ # Determine the full name and path of the output file
+ my $target_file_path;
+ if ($self->{'enable_cache'}) {
+ $self->init_cache_for_file($source_file_full_path);
+ my $cache_dir = $self->{'cached_dir'};
+ my $file_root = $self->{'cached_file_root'};
+ #$file_root .= "_$convert_id" if ($convert_id ne "");
+
+ # append the output filetype suffix only for non-image output formats, since for
+ # images we can be outputting multiple image files per single PDF input file
+ my $target_file = ($img_output_mode || $paged_txt_output_mode) ? "$file_root" : "$file_root.$target_file_type";
+
+ $target_file_path = &FileUtils::filenameConcatenate($cache_dir,$target_file);
+ }
+ else {
+ # this is in gsdl/tmp. get a tmp filename in collection instead???
+ $target_file_path = &util::get_tmp_filename($target_file_type);
+
+ # for image files, remove the suffix, since we can have many output image files
+ # per input PDF (one img for each page of the PDF, for example)
+ if($img_output_mode || $paged_txt_output_mode) {
+ $target_file_path =~ s/\.[^.]*$//g;
+ if(!&FileUtils::directoryExists($target_file_path)) {
+ mkdir($target_file_path);
+ }
+
+ # once the item file for the imgs has been created, need to adjust target_file_path
+
+ # below, we'll store the dir just created to pbtmp_file_paths, so all imgs and the
+ # item file generated in it can be deleted in one go on clean_up
+ }
+
+ push(@{$self->{'pbtmp_file_paths'}}, $target_file_path);
+ }
+
+ # Generate and run the convert command
+ my $convert_cmd = "";
+
+ # want the filename without extension, because any images
+ # are to be generated with the same filename as the PDF
+ my ($tailname, $dirname, $suffix) = &File::Basename::fileparse($source_file_full_path, "\\.[^\\.]+\$");
+
+ if($img_output_mode || $paged_txt_output_mode) { # converting each page to image and/or text
+ my $output_prefix = &FileUtils::filenameConcatenate($target_file_path, $tailname);
+
+ # Our custom class does renaming of the pages (simplified to just numbers) for PagedImagePlugin
+ #$convert_cmd = $paged_txt_output_mode ? $self->{'malwarechecker_imgtxt_launch_cmd'} : $self->{'malwarechecker_img_launch_cmd'};
+ $convert_cmd = $self->{'malwarechecker_imgtxt_launch_cmd'};
+ $convert_cmd .= " -imagesOnly" unless($paged_txt_output_mode); # set to images only unless there's text too
+ if($img_output_mode) { # whether images-only or images-and-text mode
+ $convert_cmd .= " -imageType $target_file_type";
+ $convert_cmd .= " -dpi ". $self->{"dpi"} if defined $self->{"dpi"};
+ } else { # img_output_mode off, so paged txt only and no images
+ $convert_cmd .= " -textOnly";
+ }
+ $convert_cmd .= " -outputPrefix \"$output_prefix\"";
+ $convert_cmd .= " \"$source_file_full_path\"";
+
+ } else { # single stream of text or html
+
+ if ($target_file_type eq "html") {
+ $convert_cmd = $self->{'malwarechecker_html_launch_cmd'};
+ $convert_cmd .= " -html" if ($target_file_type eq "html");
+ } else {
+ $convert_cmd = $self->{'malwarechecker_txt_launch_cmd'};
+ }
+ $convert_cmd .= " \"$source_file_full_path\" \"$target_file_path\"";
+ }
+
+ if ($verbosity>2) {
+ &gsprintf($outhandle,"Convert command: $convert_cmd\n");
+ }
+
+ my $print_info = { 'message_prefix' => "PDFBox Conversion",
+ 'message' => "Converting $source_file_no_path to: $target_file_type" };
+ # $print_info->{'cache_mode'} = $cache_mode if ($cache_mode ne "");
+
+ my ($regenerated,$result,$had_error)
+ = $self->autorun_general_cmd($convert_cmd,$source_file_full_path, $target_file_path,$print_info);
+
+ if($img_output_mode || $paged_txt_output_mode) {
+ # now the images have been generated, generate the "$target_file_path/tailname.item"
+ # item file for them, which is also the target_file_path that needs to be returned
+ $target_file_path = &util::create_itemfile($target_file_path, $tailname, $target_file_type);
+ #print STDERR "**** item file: $target_file_path\n";
+ }
+ elsif ($self->{'converted_to'} eq "text") {
+ # ensure html entities are doubly escaped for pdfbox to text conversion: & -> &amp;
+ # conversion to html does it automatically, but conversion to text doesn't
+ # and this results in illegal characters in doc.xml
+
+ my $fulltext = &FileUtils::readUTF8File($target_file_path);
+ if(defined $fulltext) {
+ #$fulltext = &HTML::Entities::encode($fulltext); # doesn't seem to help
+ # Escape ampersands so the extracted text can't break the doc.xml markup.
+ # (The replacement must be "&amp;" -- a bare s@&@&@ would be a no-op.)
+ $fulltext =~ s@&@&amp;@sg; # Kathy's fix to ensure doc contents don't break XML
+ &FileUtils::writeUTF8File($target_file_path, \$fulltext);
+ } else {
+ print STDERR "MalwareCheckerConverter::convert(): Unable to read from converted file\n";
+ $had_error = 1;
+ }
+ }
+
+ if ($had_error) {
+ return (0, $result,$target_file_path);
+ }
+ return (1, $result,$target_file_path);
+}
+
+# Thin wrapper around convert() that tags the call as "without_result".
+# NOTE(review): convert() as defined above only unpacks two arguments, so the
+# extra convert_options/convert_id/"without_result" args are currently ignored
+# by it (harmless in Perl) -- confirm whether convert() should honour them.
+sub convert_without_result {
+ my $self = shift(@_);
+
+ my $source_file_path = shift(@_);
+ my $target_file_type = shift(@_);
+ my $convert_options = shift(@_) || "";
+ my $convert_id = shift(@_) || "";
+
+ return $self->convert($source_file_path,$target_file_type,
+ $convert_options,$convert_id,"without_result");
+}
+
+# Remove every temporary file or directory recorded in pbtmp_file_paths during
+# convert() (a directory holds per-page images plus the generated item file),
+# then reset the list.
+sub clean_up_temporary_files {
+ my $self = shift(@_);
+
+ foreach my $pbtmp_file_path (@{$self->{'pbtmp_file_paths'}}) {
+ if (-d $pbtmp_file_path) {
+ #print STDERR "@@@@@@ cleanup called on $pbtmp_file_path\n";
+ &FileUtils::removeFilesRecursive($pbtmp_file_path);
+ }
+ elsif (-e $pbtmp_file_path) {
+ &FileUtils::removeFiles($pbtmp_file_path);
+ }
+ }
+
+ # Reset to an empty array ref (assigning () would leave the slot undef).
+ $self->{'pbtmp_file_paths'} = [];
+}
+
+
+1;
Index: /gs2-extensions/malware-checker/trunk/perllib/plugins/PDFv3Plugin.pm
===================================================================
--- /gs2-extensions/malware-checker/trunk/perllib/plugins/PDFv3Plugin.pm (revision 33676)
+++ /gs2-extensions/malware-checker/trunk/perllib/plugins/PDFv3Plugin.pm (revision 33676)
@@ -0,0 +1,808 @@
+###########################################################################
+#
+# PDFv3Plugin.pm -- pdf plugin that uses xpdftools and pdfbox to process PDFs.
+# It only works out of the box for GS3 since it assumes the pdfbox extension
+# is installed.
+# A component of the Greenstone digital library software
+# from the New Zealand Digital Library Project at the
+# University of Waikato, New Zealand.
+#
+# Copyright (C) 1999-2001 New Zealand Digital Library Project
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+###########################################################################
+package PDFv3Plugin;
+
+use strict;
+no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
+no strict 'subs'; # allow filehandles to be variables and viceversa
+
+use AutoLoadConverters;
+use ConvertBinaryFile;
+use PDFBoxConverter;
+use ReadTextFile;
+
+use unicode;
+use Mojo::DOM; # for HTML parsing
+
+
+@PDFv3Plugin::ISA = ('AutoLoadConverters', 'ConvertBinaryFile', 'PDFBoxConverter', 'ReadTextFile');
+
+# Set to 1 in new() when AutoLoadConverters reports the MalwareCheckerConverter
+# extension is usable.
+my $malwarechecker_available = 0;
+
+# The output formats offered by this plugin's convert_to option; the trailing
+# comment on each entry records which tool (xpdftools or pdfbox) produces it.
+my $convert_to_list =
+ [ { 'name' => "auto", # pretty_html using xpdftools' pdftohtml
+ 'desc' => "{ConvertBinaryFile.convert_to.auto}" },
+ { 'name' => "text", # xpdftools' pdftotext
+ 'desc' => "{ConvertBinaryFile.convert_to.text}" },
+ { 'name' => "paged_text", # pdfbox
+ 'desc' => "{ConvertBinaryFile.convert_to.paged_text}" },
+
+ { 'name' => "html", # pdfbox ## TODO: rename this to html_without_imgs?
+ 'desc' => "{PDFPlugin.convert_to.html}" },
+ { 'name' => "pretty_html", # xpdftools
+ 'desc' => "{PDFPlugin.convert_to.pretty_html}" },
+ { 'name' => "paged_pretty_html", # xpdftools
+ 'desc' => "{PDFPlugin.convert_to.paged_pretty_html}"},
+
+ # pdfbox for all pagedimg(txt) output formats:
+ { 'name' => "pagedimg_jpg",
+ 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
+ { 'name' => "pagedimg_png",
+ 'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}"},
+
+ { 'name' => "pagedimgtxt_jpg",
+ 'desc' => "{ConvertBinaryFile.convert_to.pagedimgtxt_jpg}"},
+ { 'name' => "pagedimgtxt_png",
+ 'desc' => "{ConvertBinaryFile.convert_to.pagedimgtxt_png}"},
+ ];
+
+
+# Declared plugin arguments. The malware_checker flag below is appended to this
+# list at construction time only when the extension is actually available.
+my $arguments =
+ [
+ { 'name' => "convert_to",
+ 'desc' => "{ConvertBinaryFile.convert_to}",
+ 'type' => "enum",
+ 'reqd' => "yes",
+ 'list' => $convert_to_list,
+ 'deft' => "paged_pretty_html" },
+ { 'name' => "process_exp",
+ 'desc' => "{BaseImporter.process_exp}",
+ 'type' => "regexp",
+ 'deft' => &get_default_process_exp(),
+ 'reqd' => "no" },
+ { 'name' => "block_exp",
+ 'desc' => "{CommonUtil.block_exp}",
+ 'type' => "regexp",
+ 'deft' => &get_default_block_exp() },
+# { 'name' => "metadata_fields",
+# 'desc' => "{HTMLPlugin.metadata_fields}",
+# 'type' => "string",
+# 'deft' => "Title,Author,Subject,Keywords" },
+# { 'name' => "metadata_field_separator",
+# 'desc' => "{HTMLPlugin.metadata_field_separator}",
+# 'type' => "string",
+# 'deft' => "" },
+ { 'name' => "dpi",
+ 'desc' => "{PDFv2Plugin.dpi}",
+ 'deft' => "96",
+ 'type' => "int" }, # 72DPI is xpdf's pdftohtml's default. pdfbox' default is 96DPI in headless mode else detected from the screen resolution, see https://pdfbox.apache.org/2.0/commandline.html#pdftoimage
+# { 'name' => "use_sections",
+# 'desc' => "{PDFPlugin.use_sections}",
+# 'type' => "flag" },
+# { 'name' => "description_tags",
+# 'desc' => "{HTMLPlugin.description_tags}",
+# 'type' => "flag" },
+ { 'name' => "use_realistic_book",
+ 'desc' => "{PDFPlugin.use_realistic_book}",
+ 'type' => "flag" }
+ ];
+
+# Optional argument exposed only when the malware-checker extension is present.
+my $opt_malwarechecker_args = [ { 'name' => "malware_checker",
+ 'desc' => "{MalwareChecker.malware_checker}",
+ 'type' => "flag",
+ 'reqd' => "no" } ];
+
+
+# Plugin-framework metadata. The name must match the package (PDFv3Plugin) so
+# the framework resolves this plugin correctly.
+# NOTE(review): the desc key is left as {PDFv2Plugin.desc} since a
+# PDFv3Plugin.desc entry may not exist in the resource bundle yet -- confirm.
+my $options = { 'name' => "PDFv3Plugin",
+ 'desc' => "{PDFv2Plugin.desc}",
+ 'abstract' => "no",
+ 'inherits' => "yes",
+ 'srcreplaceable' => "yes", # Source docs in PDF can be replaced with GS-generated html
+ 'args' => $arguments };
+
+# Constructor. Merges the AutoLoadConverters (with MalwareCheckerConverter),
+# PDFBoxConverter and ConvertBinaryFile parents, resolves the convert_to mode,
+# and configures the secondary plugin (HTMLPlugin or PagedImagePlugin) that
+# will process the converted output.
+sub new {
+ my ($class) = shift (@_);
+ my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
+ push(@$pluginlist, $class);
+
+ push(@{$hashArgOptLists->{"OptList"}},$options);
+
+ # New Malware-Checker
+ my $auto_converter_self = new AutoLoadConverters($pluginlist,$inputargs,$hashArgOptLists,["MalwareCheckerConverter"],1);
+
+ if ($auto_converter_self->{'malwarechecker_available'}) {
+ push(@$arguments,@$opt_malwarechecker_args);
+ $malwarechecker_available = 1;
+ }
+
+ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
+
+ # the 1 at the end of the first constructor call is to allow extra arguments to remain after
+ # parsing. There should be no extra args after the final constructor call, because all args
+ # should have been parsed by then, so don't pass 1 to the last constructor.
+ my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists, 1);
+ my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists);
+ my $self = BaseImporter::merge_inheritance($auto_converter_self, $pdfbox_converter_self, $cbf_self); # this param order seems necessary to preserve the default/user-selected value for the convert_to option
+
+ if ($self->{'info_only'}) {
+ # don't worry about any options etc
+ return bless $self, $class;
+ }
+
+ $self = bless $self, $class;
+ $self->{'file_type'} = "PDF";
+
+ # convert_options are passed through to gsConvert.pl by ConvertBinaryFile.pm
+
+ # the most important option is the tool that's used to do the conversion
+ $self->{'convert_options'} = "-pdf_tool xpdftools"; # default for PDFv2Plugin. If pdfbox_conversion is on, the pdfbpox GS extension sets pdf_tool to pdfbox
+
+ # Setting dpi has meaning for xpdftools pdftohtml (so paged_pretty_html and pretty_html)
+ # and for when pdfbox outputs an image for each page (pagedimg, pagedimgtxt).
+ # dpi has no effect on (paged_)text and html output modes.
+ my $dpi = $self->{"dpi"};
+ $self->{'convert_options'} .= " -pdf_dpi $dpi";
+
+ # The old pdftohtml tool used by PDFPlugin didn't do PDF to txt conversion on Windows
+ # But PDFv2Plugin now supports PDF to txt conversion on Windows too using XPDFTools' pdftotext
+
+ if ($self->{'convert_to'} eq "auto") {
+ # defaulting to paged_pretty_html, as it's the best default option when using xpdftools
+ $self->{'convert_to'} = "paged_pretty_html";
+ &gsprintf::gsprintf(STDERR, "PDFv3Plugin: {PDFv2Plugin.auto_output_default}\n", $self->{'convert_to'});
+ }
+ if ($self->{'use_realistic_book'}) {
+ if ($self->{'convert_to'} ne "html") {
+ &gsprintf::gsprintf(STDERR, "PDFv3Plugin: {PDFPlugin.html_for_realistic_book}\n");
+ $self->{'convert_to'} = "html";
+ }
+ }
+
+ # set convert_to_plugin and convert_to_ext
+ $self->set_standard_convert_settings();
+
+ my $secondary_plugin_name = $self->{'convert_to_plugin'};
+ my $secondary_plugin_options = $self->{'secondary_plugin_options'};
+
+ if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
+ $secondary_plugin_options->{$secondary_plugin_name} = [];
+ }
+ my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};
+
+ my $associate_tail_re = $self->{'associate_tail_re'};
+ if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
+ push(@$specific_options, "-associate_tail_re", $associate_tail_re);
+ }
+ push(@$specific_options, "-file_rename_method", "none");
+
+ if ($secondary_plugin_name eq "HTMLPlugin") {
+ # pdftohtml always produces utf8 - What about pdfbox???
+ # push(@$specific_options, "-input_encoding", "utf8");
+ push(@$specific_options, "-extract_language") if $self->{'extract_language'};
+ push(@$specific_options, "-processing_tmp_files");
+ # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
+ # to extract these metadata fields from the HEAD META fields
+ if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
+ push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
+ } else {
+ # Keep the default field list on one line: an embedded newline in the
+ # option value would corrupt the secondary plugin's arguments.
+ push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author");
+ }
+ if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
+ push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
+ }
+ if ($self->{'use_sections'} || $self->{'description_tags'}) {
+ $self->{'description_tags'} = 1;
+ push(@$specific_options, "-description_tags");
+ }
+ if ($self->{'use_realistic_book'}) {
+ push(@$specific_options, "-use_realistic_book");
+ }
+ if($self->{'convert_to'} eq "paged_pretty_html") { # for paged pretty html, the default should be to sectionalise
+ # the single superpage, the one containing divs representing individual pages as sections, on headings
+ # Secondary-plugin flags take a leading dash (cf. -description_tags above).
+ push(@$specific_options, "-sectionalise_using_h_tags");
+ }
+ }
+ elsif ($secondary_plugin_name eq "PagedImagePlugin") {
+ push(@$specific_options, "-screenviewsize", "1000");
+ push(@$specific_options, "-enable_cache");
+ push(@$specific_options, "-processing_tmp_files");
+ }
+
+ $self = bless $self, $class;
+ $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
+ return $self;
+}
+
+# Default filename pattern this plugin processes: any file ending in .pdf,
+# case-insensitively.
+sub get_default_process_exp {
+ my $self = shift (@_);
+
+ return q^(?i)\.pdf$^;
+}
+
+# so we don't inherit HTMLPlug's block exp...
+# An empty block expression means no files are blocked by default.
+sub get_default_block_exp {
+ return "";
+}
+
+# Initialise all parent classes.
+# NOTE(review): with AutoLoadConverters first in @ISA, SUPER::init resolves to
+# AutoLoadConverters::init, which is then called explicitly again below --
+# confirm whether the double call is intended (comment says ConvertBinaryFile).
+sub init {
+ my $self = shift (@_);
+
+ # ConvertBinaryFile init
+ $self->SUPER::init(@_);
+ $self->PDFBoxConverter::init(@_);
+
+ # *****
+ $self->AutoLoadConverters::init(@_); # need verbosity set in $self
+
+}
+
+# Begin-of-build hook: delegate to the PDFBoxConverter parent first, then the
+# rest of the inheritance chain.
+sub begin {
+ my $self = shift (@_);
+
+ $self->PDFBoxConverter::begin(@_);
+ $self->SUPER::begin(@_);
+
+}
+
+# End-of-build hook: let PDFBoxConverter clean up its temp files, then the
+# rest of the inheritance chain.
+sub deinit {
+ my $self = shift (@_);
+
+ $self->PDFBoxConverter::deinit(@_);
+ $self->SUPER::deinit(@_);
+
+}
+
+# By setting hashing to be on ga xml this ensures that two
+# PDF files that are identical except for the metadata
+# to hash to different values. Without this, when each PDF
+# file is converted to HTML there is a chance that they
+# will both be *identical* if the conversion utility does
+# not embed the metadata in the generated HTML. This is
+# certainly the case when PDFBOX is being used.
+
+# This change makes this convert to based plugin more
+# consistent with the original vision that the same document
+# with different metadata should
+# be seen as different.
+
+# Hash on the GreenstoneArchive XML (which includes metadata) rather than the
+# converted content alone -- see the rationale in the comment block above.
+sub get_oid_hash_type {
+ my $self = shift (@_);
+ return "hash_on_ga_xml";
+}
+
+
+# Dispatch the conversion: xpdftools (via ConvertBinaryFile) for text and the
+# (paged_)pretty_html modes, PDFBoxConverter::convert for everything else.
+# Returns the converted filename, or "" on conversion failure.
+sub tmp_area_convert_file {
+
+ my $self = shift (@_);
+ my ($output_ext, $input_filename, $textref) = @_;
+
+ if($self->{'convert_to'} eq "text" || $self->{'convert_to'} =~ m/pretty_html$/) { # use xpdftools
+ return $self->ConvertBinaryFile::tmp_area_convert_file(@_);
+ }
+
+ # for all other output formats, use pdfbox:
+
+ # Here, we now do directly what AutoLoadConverters::tmp_area_convert_file(@_)
+ # does with PDFBoxConverter:
+ my ($result, $result_str, $new_filename) = $self->PDFBoxConverter::convert($input_filename, $output_ext);
+ if (defined $result && $result != 0) {
+ return $new_filename;
+ }
+ my $outhandle=$self->{'outhandle'};
+ print $outhandle "PDFBoxConverter had a conversion error\n";
+ print $outhandle "$@\n";
+ if (defined $result_str) {
+ print $outhandle "$result_str\n";
+ }
+ return "";
+}
+
+# Overriding to do some extra handling for pretty_html/paged_pretty_html output mode
+sub run_conversion_command {
+ my $self = shift (@_);
+ my ($tmp_dirname, $tmp_inputPDFname, $utf8_tailname, $lc_suffix, $tailname, $suffix) = @_;
+
+ if($self->{'convert_to'} !~ m/pretty_html$/) {
+ return $self->ConvertBinaryFile::run_conversion_command(@_);
+ }
+
+ # else, paged_pretty_html or pretty_html
+
+ # if output mode is (paged_)pretty_html, we use Xpdf tools' pdftohtml and tell it
+ # to create a subdir called "pages" in the tmp area to puts its products
+ # in there. (Xpdf's pdftohtml needs to be passed a *non-existent* directory
+ # parameter, the "pages" subdir). If Xpdf's pdftohtml has successfully run,
+ # the intermediary output file tmp/<PDFname>/pages/index.html should
+ # exist (besides other output products there)
+
+ # We let ConvertBinaryFile proceed normally, but the return value should reflect
+ # that on success it should expect the intermediary product tmpdir/pages/index.html
+ # (which is the product of xpdftohtml conversion).
+ print STDERR "**** mainline: running command ", join(" ",@_), "\n";
+
+ # Return value of the parent call is deliberately overwritten below: the real
+ # product of the xpdf conversion is pages/index.html in the tmp dir.
+ my $output_filename = $self->ConvertBinaryFile::run_conversion_command(@_);
+ $output_filename = &FileUtils::filenameConcatenate($tmp_dirname, "pages", "index.html");
+
+ # However, when convert_post_process() is done, it should have output the final
+ # product of the (paged_)pretty_html conversion: an html file of the same name and in the
+ # same tmp location as the input PDF file.
+
+ my ($name_prefix, $output_dir, $ext)
+ = &File::Basename::fileparse($tmp_inputPDFname, "\\.[^\\.]+\$");
+ $self->{'conv_filename_after_post_process'} = &FileUtils::filenameConcatenate($output_dir, $name_prefix.".html");
+# print STDERR "@@@@@ final paged html file will be: " . $self->{'conv_filename_after_post_process'} . "\n";
+
+ return $output_filename;
+}
+
+# Post-processing dispatch: (paged_)pretty_html output needs the xpdf-generated
+# per-page HTML files massaged into a single document; all other modes use the
+# default post-processing inherited from the PDF plugin family.
+sub convert_post_process
+{
+ my $self = shift (@_);
+ my ($conv_filename) = @_;
+
+ my $outhandle=$self->{'outhandle'};
+
+ if($self->{'convert_to'} =~ /pretty_html/) { # (paged_)pretty_html
+ # special post-processing for (paged_)pretty_html mode, as HTML pages generated
+ # by xpdf's pdftohtml need to be massaged into the form we want
+ $self->xpdftohtml_convert_post_process($conv_filename);
+ }
+ else { # use original PDFPlugin's usual post processing
+ $self->default_convert_post_process($conv_filename);
+ }
+}
+
+# Called after gsConvert.pl has been run to convert a PDF to (paged_)pretty_html
+# using Xpdftools' pdftohtml
+# This method will do some cleanup of the HTML files produced after XPDF has produced
+# an HTML doc for each PDF page: it first gets rid of the default index.html.
+# Instead, it constructs a single html page containing each original HTML page
+# nested as divs instead, with simple section information inserted at the top
+# of each 'page' and some further styling customisation. This HTML manipulation
+# is to be done with the Mojo::DOM perl package.
+# Note that since xpdf's pdftohtml would have failed if the output dir already
+# existed and for simpler naming, the output files are created in a new "pages"
+# subdirectory of the tmp location parent of $conv_filename instead
+sub xpdftohtml_convert_post_process
+{
+ # Post-process hook for Xpdf pdftohtml's (paged_)pretty_html output:
+ # concatenates every per-page html file found in the tmp "pages" subdir
+ # (all except index.html) into one combined html file named by
+ # $self->{'conv_filename_after_post_process'}, then deletes the per-page
+ # htm(l) files so that HTMLPlugin later processes only the combined file.
+ # Parameter: $pages_index_html - path to .../pages/index.html; only its
+ # directory component ($pages_subdir) is actually used below.
+ # Returns: nothing meaningful; dies on opendir/open failure.
+ my $self = shift (@_);
+ my ($pages_index_html) = @_; # = tmp//pages/index.html for (paged_)pretty_html output mode
+ my $output_filename = $self->{'conv_filename_after_post_process'};
+
+ # Read in all the html files in tmp's "pages" subdir, except for index.html.
+ # and use it to create a new html file called $self->{'conv_filename_after_post_process'}
+ # which will consist of a slightly modified version of
+ # each of the other html files concatenated together.
+
+ my $outhandle=$self->{'outhandle'};
+
+ # fileparse with a suffix pattern: $pages_subdir becomes the directory
+ # containing index.html; $tailname/$suffix are unused after this.
+ my ($tailname, $pages_subdir, $suffix)
+ = &File::Basename::fileparse($pages_index_html, "\\.[^\\.]+\$");
+
+ # Code from util::create_itemfile()
+ # Read in all the files
+ # NOTE(review): bareword filehandle DIR and interpolated path in the grep;
+ # works, but a lexical handle would be the modern idiom.
+ opendir(DIR, $pages_subdir) || die "can't opendir $pages_subdir: $!";
+ my @page_files = grep {-f "$pages_subdir/$_"} readdir(DIR);
+ closedir DIR;
+ # Sort files in the directory by page_num
+ # files are named index.html, page1.html, page2.html, ..., pagen.html
+ # NOTE(review): this is a named (hence package-level) sub that is only
+ # textually nested inside this one; it closes over no lexicals, so the
+ # usual nested-named-sub pitfall does not apply here.
+ sub page_number {
+ my ($dir) = @_;
+ my ($pagenum) =($dir =~ m/^page(\d+)\.html?$/i);
+ $pagenum = 0 unless defined $pagenum; # index.html will be given pagenum=0
+ return $pagenum;
+ }
+ # sort the files in the directory in the order of page_num rather than lexically.
+ @page_files = sort { page_number($a) <=> page_number($b) } @page_files;
+
+ #my $num_html_pages = (scalar(@page_files) - 1)/2; # skip index file.
+ # For every html file there's an img file, so halve the total num.
+ # What about other file types that may potentially be there too???
+ # Count only real page htm(l) files (skip index.html and any images),
+ # rather than assuming exactly one image per page as the commented-out
+ # halving approach above did.
+ my $num_html_pages = 0;
+ foreach my $pagefile (@page_files) {
+ $num_html_pages++ if $pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i;
+ }
+
+ # Prepare to create our new html page that will contain all the individual
+ # htmls generated by xpdf's pdftohtml in sequence.
+ # First write the opening html tags out to the output file. These are the
+ # same tags and their contents, including , as is generated by
+ # Xpdf's pdftohtml for each of its individual html pages.
+ # NOTE(review): the string literals below look as though they should carry
+ # the actual opening <html>/<head>/<title> tag text but appear empty here
+ # (likely mangled in transit) -- verify against the upstream PDFv2Plugin
+ # source before relying on this copy.
+ my $start_text = "\n\n";
+ my ($output_tailname, $tmp_subdir, $html_suffix)
+ = &File::Basename::fileparse($output_filename, "\\.[^\\.]+\$");
+ $start_text .= "$output_tailname\n";
+ $start_text .= "\n";
+ $start_text .= "\n\n\n";
+
+ # paged_pretty_html additionally gets sectionalising markup so Greenstone
+ # can split the combined file back into per-page document sections.
+ if($self->{'convert_to'} =~ /paged_pretty_html/) { # then add the tags for sectionalising
+ # NOTE(review): the literal below spans two source lines and the second
+ # line has lost its '+' diff prefix -- this hunk looks corrupted; confirm
+ # the intended string against the original revision.
+ $start_text .= "$output_tailname
\n\n";
+ }
+
+ #handle content encodings the same way that default_convert_post_process does
+ # $self->utf8_write_file ($start_text, $conv_filename); # will close file after write
+ # Don't want to build a giant string in memory of all the pages concatenated
+ # and then write it out in one go. Instead, build up the final single page
+ # by writing each modified (paged_)pretty_html file out to it as this is processed.
+ # Copying file open/close code from CommonUtil::utf8_write_file()
+ if (!open (OUTFILE, ">:utf8", $output_filename)) {
+ # NOTE(review): the message names PDFv2Plugin -- this code was evidently
+ # copy-pasted from PDFv2Plugin into MalwareCheckerConverter; confirm the
+ # plugin name here is intentional.
+ &gsprintf::gsprintf(STDERR, "PDFv2Plugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename);
+ die "\n";
+ }
+ print OUTFILE $start_text;
+
+ # Get the contents of each individual HTML page generated by Xpdf, after first
+ # modifying each, and write each out into our single all-encompassing html
+ foreach my $pagefile (@page_files) {
+ if ($pagefile =~ m/\.html?$/ && $pagefile !~ /^index\.html?/i) {
+ my $page_num = page_number($pagefile);
+ # get full path to pagefile
+ $pagefile = &FileUtils::filenameConcatenate($pages_subdir, $pagefile);
+# print STDERR "@@@ About to process html file $pagefile (num $page_num)\n";
+ my $modified_page_contents = $self->_process_pretty_html_page($pagefile, $page_num, $num_html_pages);
+ print OUTFILE "$modified_page_contents\n\n";
+ }
+ }
+
+ # we've now created a single HTML file by concatenating (a modified version)
+ # of each paged html file
+ print OUTFILE "\n\n"; # write out closing tags
+ # NOTE(review): close on a write handle is unchecked, so buffered write
+ # errors (e.g. disk full) would pass silently.
+ close OUTFILE; # done
+
+ # Get rid of all the htm(l) files incl index.html in the associated "pages"
+ # subdir, since we've now processed them all into a single html file
+ # one folder level up and we don't want HTMLPlugin to process all of them next.
+ &FileUtils::removeFilesFiltered($pages_subdir, "\.html?\$"); # no specific whitelist, but blacklist htm(l)
+
+ # now the tmp area should contain a single html file contain all the html pages'
+ # contents in sequence, and a "pages" subdir containing the screenshot images
+ # of each page.
+ # HTMLPlugin will process these further in the plugin pipeline
+}
+
+# For whatever reason, most HTML strings don't get printed out in GLI.
+# So when debugging, use this function to print them with the angle
+# brackets replaced by [square brackets] instead.
+sub _debug_print_html
+{
+ my $self = shift (@_);
+ my ($string_or_dom) = @_;
+
+ # Mojo::DOM objects stringify automatically in string context
+ # (https://stackoverflow.com/questions/5214543/what-is-stringification-in-perl),
+ # so forcing stringification with quotes handles both plain strings and
+ # DOM arguments; no ref/reftype type-checking is needed.
+ my $bracketed = "$string_or_dom";
+
+ # Swap angle brackets for square brackets in a single pass, since GLI
+ # tends to swallow anything that looks like a real HTML tag.
+ $bracketed =~ tr/<>/[]/;
+
+ print STDERR "#### $bracketed\n";
+}
+
+# Helper function for (paged_)pretty_html
+# to read in each page of pretty_html generated by Xpdf's pdftohtml
+# then modify the html suitably using the HTML parsing functions offered by
+# Mojo::DOM, then return the modified HTML content as a string.
+# For paged_pretty_html, some additional modification is done to sectionalise the final html
+# See https://mojolicious.org/perldoc/Mojo/DOM
+sub _process_pretty_html_page
+{
+ my $self = shift (@_);
+ my ($pagefile, $page_num, $num_html_pages) = @_;
+
+ my $text = "";
+
+ # handling content encoding the same way default_convert_post_process does
+ $self->read_file ($pagefile, "utf8", "", \$text);
+
+ my $dom = Mojo::DOM->new($text);
+
+# $self->_debug_print_html($dom);
+
+ # there's a