###########################################################################
#
# PDFv1Plugin.pm -- The older pdf plugin, which uses the older pdftohtml
# tool that can't handle newer versions of PDFs.
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2018 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package PDFv1Plugin;

use strict;
no strict 'refs'; # so we can use a var for filehandles (e.g. STDERR)
no strict 'subs'; # allow filehandles to be variables and viceversa

use ConvertBinaryFile;
use ReadTextFile;

use unicode;

@PDFv1Plugin::ISA = ('ConvertBinaryFile', 'ReadTextFile');

# PDFv1 plugin should be returned to being more like it was before
# AutoLoadConverters/PDFBox extension's inclusion,
# like the PDFPlugin was at
# http://trac.greenstone.org/browser/main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm?rev=22597

# Regex handed to -title_sub, both for this plugin and for the secondary
# plugin.  It removes the "Page 1" added by pdftohtml, and a leading "1",
# which is often the page number at the top of the page.  Bad luck if your
# document title actually starts with "1 " - is there a better way?
my $title_sub_default = '^(Page\s+\d+)?(\s*1\s+)?';

# Allowed values for the convert_to argument.
my $convert_to_list =
    [ { 'name' => "auto",
        'desc' => "{ConvertBinaryFile.convert_to.auto}" },
      { 'name' => "html",
        'desc' => "{ConvertBinaryFile.convert_to.html}" },
      { 'name' => "text",
        'desc' => "{ConvertBinaryFile.convert_to.text}" },
      { 'name' => "pagedimg_jpg",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_jpg}"},
      { 'name' => "pagedimg_gif",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_gif}"},
      { 'name' => "pagedimg_png",
        'desc' => "{ConvertBinaryFile.convert_to.pagedimg_png}"},
    ];

# Plugin arguments; appended to the caller's ArgList in new().
my $arguments =
    [
     { 'name' => "convert_to",
       'desc' => "{ConvertBinaryFile.convert_to}",
       'type' => "enum",
       'reqd' => "yes",
       'list' => $convert_to_list,
       'deft' => "html" },
     { 'name' => "process_exp",
       'desc' => "{BaseImporter.process_exp}",
       'type' => "regexp",
       'deft' => &get_default_process_exp(),
       'reqd' => "no" },
     { 'name' => "block_exp",
       'desc' => "{CommonUtil.block_exp}",
       'type' => "regexp",
       'deft' => &get_default_block_exp() },
     { 'name' => "metadata_fields",
       'desc' => "{HTMLPlugin.metadata_fields}",
       'type' => "string",
       'deft' => "Title,Author,Subject,Keywords" },
     { 'name' => "metadata_field_separator",
       'desc' => "{HTMLPlugin.metadata_field_separator}",
       'type' => "string",
       'deft' => "" },
     { 'name' => "noimages",
       'desc' => "{PDFPlugin.noimages}",
       'type' => "flag" },
     { 'name' => "allowimagesonly",
       'desc' => "{PDFPlugin.allowimagesonly}",
       'type' => "flag" },
     { 'name' => "complex",
       'desc' => "{PDFPlugin.complex}",
       'type' => "flag" },
     { 'name' => "nohidden",
       'desc' => "{PDFPlugin.nohidden}",
       'type' => "flag" },
     { 'name' => "zoom",
       'desc' => "{PDFv1Plugin.zoom}",
       'deft' => "2",
       'range' => "1,3", # actually the range is 0.5-3
       'type' => "int" },
     { 'name' => "use_sections",
       'desc' => "{PDFPlugin.use_sections}",
       'type' => "flag" },
     { 'name' => "description_tags",
       'desc' => "{HTMLPlugin.description_tags}",
       'type' => "flag" },
     { 'name' => "use_realistic_book",
       'desc' => "{PDFPlugin.use_realistic_book}",
       'type' => "flag"}
    ];

# Plugin description record; appended to the caller's OptList in new().
my $options = { 'name'     => "PDFv1Plugin",
                'desc'     => "{PDFPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'srcreplaceable' => "yes", # Source docs in PDF can be replaced with GS-generated html
                'args'     => $arguments };

# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is pushed onto it
#   $inputargs       - array ref of plugin arguments; "-title_sub" and its
#                      regex are appended before the parent constructor runs
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's $arguments and $options
#
# Returns the object blessed into $class.  When the parent constructor sets
# 'info_only' the object is returned straight away, with none of the
# conversion settings filled in.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@$inputargs,"-title_sub");
    push(@$inputargs,$title_sub_default);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = ConvertBinaryFile->new($pluginlist, $inputargs, $hashArgOptLists);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    $self = bless $self, $class;
    $self->{'file_type'} = "PDF";

    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_tool pdftohtml"; # PDFPluginv1 only ever uses the old pdftohtml conversion tool
    $self->{'convert_options'} .= " -pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # check convert_to
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($self->{'convert_to'} eq "text" && $on_windows) {
        # text output is reported as unsupported on Windows; fall back to html
        &gsprintf::gsprintf(STDERR, "{PDFv1Plugin.win_old_pdftotext_unsupported}\n");
        $self->{'convert_to'} = "html";
    }
    elsif ($self->{'convert_to'} eq "auto") {
        # choose html ?? is this the best option
        $self->{'convert_to'} = "html";
    }
    if ($self->{'use_realistic_book'}) {
        if ($self->{'convert_to'} ne "html") {
            print STDERR "PDFs will be converted to HTML for realistic book functionality\n";
            $self->{'convert_to'} = "html";
        }
    }

    # set convert_to_plugin and convert_to_ext
    $self->set_standard_convert_settings();

    my $secondary_plugin_name = $self->{'convert_to_plugin'};
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};

    if (!defined $secondary_plugin_options->{$secondary_plugin_name}) {
        $secondary_plugin_options->{$secondary_plugin_name} = [];
    }
    my $specific_options = $secondary_plugin_options->{$secondary_plugin_name};

    # the secondary plugin gets the same title_sub as this plugin
    push(@$specific_options , "-title_sub", $title_sub_default);

    my $associate_tail_re = $self->{'associate_tail_re'};
    if ((defined $associate_tail_re) && ($associate_tail_re ne "")) {
        push(@$specific_options, "-associate_tail_re", $associate_tail_re);
    }
    push(@$specific_options, "-file_rename_method", "none");

    if ($secondary_plugin_name eq "HTMLPlugin") {
        # pdftohtml always produces utf8
        push(@$specific_options, "-input_encoding", "utf8");
        push(@$specific_options, "-extract_language") if $self->{'extract_language'};
        push(@$specific_options, "-processing_tmp_files");

        # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
        # to extract these metadata fields from the HEAD META fields
        if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
            push(@$specific_options,"-metadata_fields",$self->{'metadata_fields'});
        }
        else {
            push(@$specific_options,"-metadata_fields","Title,GENERATOR,date,author");
        }
        if (defined $self->{'metadata_field_separator'} && $self->{'metadata_field_separator'} =~ /\S/) {
            push(@$specific_options,"-metadata_field_separator",$self->{'metadata_field_separator'});
        }
        if ($self->{'use_sections'} || $self->{'description_tags'}) {
            # use_sections implies description_tags
            $self->{'description_tags'} = 1;
            push(@$specific_options, "-description_tags");
        }
        if ($self->{'use_realistic_book'}) {
            push(@$specific_options, "-use_realistic_book");
        }
    }
    elsif ($secondary_plugin_name eq "PagedImagePlugin") {
        push(@$specific_options, "-screenviewsize", "1000");
        push(@$specific_options, "-enable_cache");
        push(@$specific_options, "-processing_tmp_files");
    }

    $self = bless $self, $class; # Q TODO: why does it do this a 2nd time in this function?
    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);
    return $self;
}

# Default process_exp: file names ending in ".pdf", case-insensitively.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.pdf$^;
}

# so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return "";
}

# By setting hashing to be on ga xml this ensures that two
# PDF files that are identical except for the metadata
# hash to different values.  Without this, when each PDF
# file is converted to HTML there is a chance that they
# will both be *identical* if the conversion utility does
# not embed the metadata in the generated HTML.  This is
# certainly the case when PDFBOX is being used.

# This change makes this convert-to based plugin more
# consistent with the original vision that the same document
# with different metadata should be seen as different.
sub get_oid_hash_type {
    my $self = shift (@_);

    return "hash_on_ga_xml";
}

#sub tmp_area_convert_file {
#
#    my $self = shift (@_);
#    return $self->AutoLoadConverters::tmp_area_convert_file(@_);
#
#}

# Post-process the converted file in place.
#
# Argument: $conv_filename - path of the converted file.
#
# Reads the file as utf8, blanks out surrogate code points, counts the page
# anchors and stores the count in $self->{'num_pages'}.  When use_sections is
# set and the conversion produced HTML, the text is split on the page anchors
# and each piece is wrapped in section markup.  The text is then written back
# to $conv_filename.
sub convert_post_process {
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle=$self->{'outhandle'};

    #$self->{'input_encoding'} = "utf8"; # The output is always in utf8 (is it?? it is for html, but what about other types?)
    #my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    # encoding will be utf8 for html files - what about other types? will we do this step for them anyway?
    $self->read_file ($conv_filename, "utf8", "", \$text);

    # Clean html from low and high surrogates D800–DFFF
    $text =~ s@[\N{U+D800}-\N{U+DFFF}]@\ @g;

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page).  Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as it might
    # be useful to give an indication of document length in browser through setting
    # num_pages as metadata.
    my @pages = ($text =~ m/\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlugin: Calculating sections...\n";

        # NOTE(review): in the copy of this file that was reviewed, every
        # HTML-like fragment between here and the end of this if-block had
        # been stripped out (anchor patterns, tag-removal regexes and the
        # <!--<Section>--> / <Metadata> strings).  The statements below are
        # restored from the fragments that survived; verify each of them
        # against the repository copy before merging.

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { # only one section - no split!
            print $outhandle "PDFPlugin: warning - no sections found\n";
        } else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # derive a title for the top section from the first page's text
        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//; # leftover of the anchor removed by split
        $title =~ s/<\/([^>]+)><\1>//g; # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g; # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections=();
        } else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            $section =~ s@^\"?(\w+)\"?></a>@@; # leftover from split expression...

            $title = $1; # Greenstone does magic if sections are titled digits

            if (! defined($title) ) {
                print STDERR "no title: $section\n";
                $title = " "; # get rid of the undefined warning in next line
            }
            # NOTE(review): a "#TODO:" comment followed this assignment in the
            # original; its text did not survive and is not restored here.
            my $newsection = "<!-- from PDFPlugin -->\n<!-- <Section>\n"
                . "<Metadata name=\"Title\">" . $title
                . "</Metadata>\n--><br />\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # $section aliases the array element
        }

        $text=join('', ($top_section, @sections));
    }

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "text") {
        print STDERR "**** When converting PDF to text, cannot apply use_sections\n";
    }

    # The following should no longer be needed, now that strings
    # read in are Unicode aware (in the Perl sense) rather than
    # raw binary strings that just happen to be UTF-8 compliant

    # turn any high bytes that aren't valid utf-8 into utf-8.
    ## unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}

# do plugin specific processing of doc_obj for HTML type
#
# Arguments: ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli); only
# $base_dir, $file and $doc_obj are used here.
#
# Returns whatever $self->process_type() returned.  On the top section of
# $doc_obj it replaces each "date" value with a "Date" value in YYYYMMDD
# form, adds "NumPages" when a page count is known, and sets "gsdlthistype"
# for sectioned HTML conversions.
sub process {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type($base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Only read the captures after a successful match, so that values
        # left over from an earlier match can never be mistaken for a date.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1,$2,$3);

        next if $year == 0;
        $year += 1900 if $year < 100;             # just to be safe
        $month = "0$month" if $month =~ /^\d$/;   # single digit
        $day   = "0$day"   if $day   =~ /^\d$/;   # single digit

        my $date="$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'}) if defined $self->{'num_pages'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # For gs2 we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        if (&util::is_gs3()) {
            # but for gs3, paged docs currently use image slider which is ugly if there are no images
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Hierarchy");
        } else {
            $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
        }
    }

    return $result;
}

1;