########################################################################### # # PDFPlug.pm -- reasonably with-it pdf plugin # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999-2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

# PDFPlug converts PDF documents via the ConvertToPlug machinery and then
# post-processes the converted file (page counting, optional sectioning,
# date metadata normalisation).

package PDFPlug;

use ConvertToPlug;
use unicode;
use strict;
no strict 'refs'; # so we can use a var for filehandles (eg STDERR)

sub BEGIN {
    @PDFPlug::ISA = ('ConvertToPlug');
}

# Allowed values for the -convert_to option.
my $convert_to_list =
    [ { 'name' => "auto",
        'desc' => "{ConvertToPlug.convert_to.auto}" },
      { 'name' => "html",
        'desc' => "{ConvertToPlug.convert_to.html}" },
      { 'name' => "text",
        'desc' => "{ConvertToPlug.convert_to.text}" },
      { 'name' => "pagedimg_jpg",
        'desc' => "{ConvertToPlug.convert_to.pagedimg_jpg}"},
      { 'name' => "pagedimg_gif",
        'desc' => "{ConvertToPlug.convert_to.pagedimg_gif}"},
      { 'name' => "pagedimg_png",
        'desc' => "{ConvertToPlug.convert_to.pagedimg_png}"},
    ];

# Plugin argument descriptions (name, description key, type, default...).
my $arguments =
    [ { 'name' => "convert_to",
        'desc' => "{ConvertToPlug.convert_to}",
        'type' => "enum",
        'reqd' => "yes",
        'list' => $convert_to_list,
        'deft' => "html" },
      { 'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no" },
      { 'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp() },
      { 'name' => "metadata_fields",
        'desc' => "{HTMLPlug.metadata_fields}",
        'type' => "string",
        'deft' => "" },
      { 'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag" },
      { 'name' => "allowimagesonly",
        'desc' => "{PDFPlug.allowimagesonly}",
        'type' => "flag" },
      { 'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag" },
      { 'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag" },
      { 'name' => "zoom",
        'desc' => "{PDFPlug.zoom}",
        'deft' => "2",
        'range' => "1,3", # actually the range is 0.5-3
        'type' => "int" },
      { 'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag" },
      { 'name' => "description_tags",
        'desc' => "{HTMLPlug.description_tags}",
        'type' => "flag" }
    ];

my $options = { 'name'     => "PDFPlug",
                'desc'     => "{PDFPlug.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - array ref of plugin arguments; a default -title_sub
#                      is appended to it
#   $hashArgOptLists - hash ref collecting "ArgList"/"OptList" entries
#
# Returns the ConvertToPlug-constructed object re-blessed into $class, with
# 'convert_options' set and the secondary plugins loaded (unless the
# object is 'info_only', in which case it is returned immediately).
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    push(@$inputargs, "-title_sub", $title_sub);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # NOTE(review): the pushes above autovivify $hashArgOptLists, so the
    # two-argument branch looks unreachable in practice - confirm before
    # simplifying.
    my $self = (defined $hashArgOptLists)
        ? ConvertToPlug->new($pluginlist,$inputargs,$hashArgOptLists)
        : ConvertToPlug->new($pluginlist,$inputargs);

    if ($self->{'info_only'}) {
        # don't worry about any options etc
        return bless $self, $class;
    }

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    my $zoom = $self->{"zoom"};
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"};
    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"};
    $self->{'convert_options'} .= " -pdf_ignore_images" if $self->{"noimages"};
    $self->{'convert_options'} .= " -pdf_allow_images_only" if $self->{"allowimagesonly"};

    # Make sure each secondary plugin we may use has an option list.
    my $secondary_plugin_options = $self->{'secondary_plugin_options'};
    if (!defined $secondary_plugin_options->{'HTMLPlug'}) {
        $secondary_plugin_options->{'HTMLPlug'} = [];
    }
    if (!defined $secondary_plugin_options->{'TEXTPlug'}) {
        $secondary_plugin_options->{'TEXTPlug'} = [];
    }
    if (defined $self->{'convert_to'} && $self->{'convert_to'} =~ /pagedimg.*/i) {
        # Only seed PagedImgPlug options if nobody supplied any already.
        if (!defined $secondary_plugin_options->{'PagedImgPlug'}) {
            $secondary_plugin_options->{'PagedImgPlug'} = [];
            my $pagedimg_options = $secondary_plugin_options->{'PagedImgPlug'};
            push(@$pagedimg_options, "-title_sub", $title_sub);
        }
    }

    my $html_options = $secondary_plugin_options->{'HTMLPlug'};
    my $text_options = $secondary_plugin_options->{'TEXTPlug'};

    if ($self->{'input_encoding'} eq "auto") {
        # pdftohtml will always produce html files encoded as utf-8
        # => restrict primary PDFPlug and secondary HTML plugin to use
        # utf8 and extract language.
        $self->{'input_encoding'} = "utf8";
        $self->{'extract_language'} = 1;
        push(@$html_options,"-extract_language");
    }

    # if pdftohtml is always producing utf8, then htmlplug always needs this option
    push(@$html_options,"-input_encoding", "utf8");

    # Instruct HTMLPlug (when eventually accessed through read_into_doc_obj)
    # to extract these metadata fields from the HEAD META fields
    if (defined $self->{'metadata_fields'} && $self->{'metadata_fields'} =~ /\S/) {
        push(@$html_options,"-metadata_fields",$self->{'metadata_fields'});
    }
    else {
        push(@$html_options,"-metadata_fields","Title,GENERATOR,date,author");
    }

    # use_sections implies description_tags.
    if ($self->{'use_sections'} || $self->{'description_tags'}) {
        $self->{'description_tags'} = 1;
        push(@$html_options,"-description_tags");
    }

    push(@$html_options, "-title_sub", $title_sub);
    push(@$text_options, "-title_sub", $title_sub);

    $self = bless $self, $class;

    $self->load_secondary_plugins($class,$secondary_plugin_options,$hashArgOptLists);

    return $self;
}

# Default regular expression for the files this plugin processes
# (case-insensitive ".pdf" suffix).
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)\.pdf$^;
}

# so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return "";
}

# Post-process the converted file in place.
#
# Arguments:
#   $conv_filename - path of the converted file to read and rewrite
#
# Side effects: sets $self->{'num_pages'}; when 'use_sections' is set and
# the conversion produced HTML, wraps each page in Greenstone section
# markup; rewrites $conv_filename as utf-8.
#
# NOTE(review): the copy of this file under review had its embedded HTML
# markup stripped by an extraction tool. The split expression, the title
# clean-up substitutions and the <Section>/<Metadata> strings below were
# reconstructed from the surviving fragments - verify against the upstream
# Greenstone source before relying on them.
sub convert_post_process {
    my $self = shift (@_);
    my ($conv_filename) = @_;

    my $outhandle = $self->{'outhandle'};

    my ($language, $encoding) = $self->textcat_get_language_encoding ($conv_filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($conv_filename, $encoding, $language, \$text);

    # Calculate number of pages based on <a ...> tags (we have a <a name=1> etc
    # for each page). Metadata based on this calculation not set until process()
    #
    # Note: this is done even if we are not breaking the document into pages as
    # it might be useful to give an indication of document length in browser
    # through setting num_pages as metadata.
    my @pages = ($text =~ /\<[Aa] name=\"?\w+\"?>/ig);
    my $num_pages = scalar(@pages);
    $self->{'num_pages'} = $num_pages;

    if ($self->{'use_sections'}
        && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";

        # we have "<a name=1></a>" etc for each page
        # it may be <A name=
        my @sections = split('<[Aa] name=', $text);

        my $top_section = "";

        if (scalar (@sections) == 1) { # only one section - no split!
            print $outhandle "PDFPlug: warning - no sections found\n";
        }
        else {
            $top_section .= shift @sections; # keep HTML header etc as top_section
        }

        # The document title is taken from the start of the first page.
        my $title = $sections[0];
        $title =~ s/^\"?\w+\"?>//;            # leftover from split, specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g;       # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;              # drop remaining tags
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;  # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s; # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        if (scalar (@sections) == 1) { # no sections found
            $top_section .= $sections[0];
            @sections = ();
        }
        else {
            $top_section .= "<!--<Section>\n<Metadata name=\"Title\">$title</Metadata>\n-->\n <!--</Section>-->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # section names are not always just digits, may be like "outline"
            # Greenstone does magic if sections are titled digits
            my $section_title;
            # Only trust $1 when the substitution succeeded; after a failed
            # match it still holds the capture of an earlier match.
            if ($section =~ s@^\"?(\w+)\"?></a>@@) { # leftover from split expression...
                $section_title = $1;
            }
            else {
                print STDERR "no title: $section\n";
                $section_title = " ";
            }

            my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
            $newsection .= "<Metadata name=\"Title\">" . $section_title
                . "</Metadata>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection; # aliases the element of @sections
        }

        $text = join('', ($top_section, @sections));
    }

    # turn any high bytes that aren't valid utf-8 into utf-8.
    unicode::ensure_utf8(\$text);

    # Write it out again!
    $self->utf8_write_file (\$text, $conv_filename);
}

# do plugin specific processing of doc_obj for HTML type
#
# Delegates to process_type("pdf", ...), then on the top section:
#   - replaces each extracted "date" value with a "Date" value in
#     YYYYMMDD form (values without a d-d-d pattern, or with year 0,
#     are simply dropped);
#   - records "NumPages" from $self->{'num_pages'};
#   - marks the document as "Paged" when sectioned HTML was produced.
# Returns the result of process_type.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    my $result = $self->process_type("pdf",$base_dir,$file,$doc_obj);

    # fix up the extracted date metadata to be in Greenstone date format,
    # and fix the capitalisation of 'date'
    my $cursection = $doc_obj->get_top_section();
    foreach my $datemeta (@{$doc_obj->get_metadata($cursection, "date")}) {
        $doc_obj->delete_metadata($cursection, "date", $datemeta);

        # We're just interested in the date bit, not the time
        # some pdf creators (eg "Acrobat 5.0 Scan Plug-in for Windows")
        # set a /CreationDate, and set /ModDate to 000000000. pdftohtml
        # extracts the ModDate, so it is 0...
        # Read the captures only after a successful match so that stale
        # values from an earlier match can never be picked up.
        next unless $datemeta =~ /(\d+)-(\d+)-(\d+)/;
        my ($year, $month, $day) = ($1, $2, $3);

        next if $year == 0;
        $year += 1900 if $year < 100;               # just to be safe
        $month = "0$month" if $month =~ /^\d$/;     # single digit
        $day   = "0$day"   if $day   =~ /^\d$/;     # single digit

        my $date = "$year$month$day";
        $doc_obj->add_utf8_metadata($cursection, "Date", $date);
    }

    $doc_obj->add_utf8_metadata($cursection, "NumPages", $self->{'num_pages'});

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {
        # we explicitly make it a paged document, cos greenstone won't get it
        # right if any section has an empty title, or one with letters in it
        $doc_obj->set_utf8_metadata_element ($cursection, "gsdlthistype", "Paged");
    }

    return $result;
}

1;