###########################################################################
#
# PDFPlug.pm -- reasonably with-it pdf plugin
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999-2001 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

# PDFPlug: Greenstone plugin for PDF files.  It subclasses ConvertToPlug,
# translates its own options into gsConvert.pl switches, and (optionally)
# splits the converted HTML into one section per page.

package PDFPlug;

use strict;
# NOTE(review): 'refs' is left off in case $self->{'outhandle'} holds a
# handle *name* (e.g. "STDERR") rather than a glob/reference -- confirm
# against ConvertToPlug/BasPlug before tightening.
no strict 'refs';

use ConvertToPlug;

sub BEGIN {
    @PDFPlug::ISA = ('ConvertToPlug');
}

# Option descriptions for this plugin.  The 'desc' values are resource
# bundle keys, not literal help text.
my $arguments = [
    {   'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
        'reqd' => "no"
    },
    {   'name' => "block_exp",
        'desc' => "{BasPlug.block_exp}",
        'type' => "regexp",
        'deft' => get_default_block_exp()
    },
    {   'name' => "noimages",
        'desc' => "{PDFPlug.noimages}",
        'type' => "flag"
    },
    {   'name' => "complex",
        'desc' => "{PDFPlug.complex}",
        'type' => "flag"
    },
    {   'name' => "nohidden",
        'desc' => "{PDFPlug.nohidden}",
        'type' => "flag"
    },
    {   'name'  => "zoom",
        'desc'  => "{PDFPlug.zoom}",
        'deft'  => "2",
        'range' => "1,3",    # actually the range is 0.5-3
        'type'  => "int"
    },
    {   'name' => "use_sections",
        'desc' => "{PDFPlug.use_sections}",
        'type' => "flag"
    }
];

my $options = {
    'name'     => "PDFPlug",
    'desc'     => "{PDFPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments
};

# new($class, @plugin_options)
#
# Constructor.  Parses the PDFPlug-specific options (noimages, complex,
# zoom, nohidden, use_sections) with parsargv::parse, builds the underlying
# ConvertToPlug object, and stores the resulting converter switches in
# $self->{'convert_options'}.
#
# Returns the object blessed into $class.  If parsargv::parse reports
# failure, a usage message is printed to the plugin's outhandle and the
# process exits with status 1.
sub new {
    my $class = shift (@_);
    my ($noimages, $complex, $zoom, $use_sections, $nohidden);

    # Copy of the option list taken before parsargv::parse is given \@_;
    # this copy (not @_) is what the superclass receives on success.
    my @args = @_;

    # following title_sub removes "Page 1" added by pdftohtml, and a leading
    # "1", which is often the page number at the top of the page. Bad Luck
    # if your document title actually starts with "1 " - is there a better way?
    my $title_sub = '^(Page\s+\d+)?(\s*1\s+)?';

    if (!parsargv::parse(\@_,
                         q^noimages^,     \$noimages,
                         q^complex^,      \$complex,
                         q^zoom/\d+/2^,   \$zoom,
                         q^nohidden^,     \$nohidden,
                         q^use_sections^, \$use_sections,
                         "allow_extra_options")) {
        # A throwaway object is built only to reach the outhandle and the
        # usage printer.  (Was "local $self", which localised an undeclared
        # package variable; a lexical is all that is needed.)
        my $self = ConvertToPlug->new($class, @_, "-title_sub", $title_sub);
        my $outhandle = $self->{'outhandle'};
        print $outhandle "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");    # Use default resource bundle
        exit 1;
    }

    if ($use_sections) {
        push (@args, "-description_tags");
    }

    my $self = ConvertToPlug->new($class, @args, "-title_sub", $title_sub);
    $self->{'plugin_type'} = "PDFPlug";

    if ($use_sections) {
        $self->{'use_sections'} = 1;
    }

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push (@{$option_list}, $options);

    # these are passed through to gsConvert.pl by ConvertToPlug.pm
    $self->{'convert_options'} = "-pdf_zoom $zoom";
    $self->{'convert_options'} .= " -pdf_complex"       if $complex;
    $self->{'convert_options'} .= " -pdf_nohidden"      if $nohidden;
    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;

    # pdftohtml will always produce html files encoded as utf-8
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'}   = "utf8";
        $self->{'extract_language'} = 1;
    }

    return bless $self, $class;
}

# Old hand-written usage text, kept commented out for reference.
#
# sub print_usage {
#     print STDERR "\n  usage: plugin PDFPlug [options]\n\n";
#     print STDERR "  options:\n";
#     print STDERR "   -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
#     print STDERR "   -use_sections           Create a separate section for each page\n";
#     print STDERR "                           of the PDF file.\n";
#     print STDERR "   -noimages               Don't attempt to extract images from PDF.\n";
#     print STDERR "   -complex                Create more complex output. With this option\n";
#     print STDERR "                           set the output html will look much more like\n";
#     print STDERR "                           the original PDF file. For this to function\n";
#     print STDERR "                           properly you Ghostscript installed (for *nix\n";
#     print STDERR "                           gs should be on your path while for windows\n";
#     print STDERR "                           you must have gswin32c.exe on your path).\n";
#     print STDERR "   -nohidden               Prevent pdftohtml from attempting to extract\n";
#     print STDERR "                           hidden text. This is only useful if the -complex\n";
#     print STDERR "                           option is also set.";
#     print STDERR "   -zoom                   The factor by which to zoomthe PDF for output\n";
#     print STDERR "                           (this is only useful if -complex is set).\n\n";
# }

# Returns the default regular expression (as a string) selecting the files
# this plugin processes: names ending in ".pdf", case-insensitively.
# Also called as a plain function while building $arguments, so it must not
# depend on an invocant.
sub get_default_process_exp {
    return q^(?i)\.pdf$^;
}

# so we don't inherit HTMLPlug's block exp...
sub get_default_block_exp {
    return "";
}

# process($self, $textref, ...)
#
# do plugin specific processing of doc_obj for HTML type
#
# When 'use_sections' is set and the document was converted to HTML, the
# text referenced by the first argument is rewritten in place so that each
# page produced by pdftohtml is wrapped in section markup.  All arguments
# are then handed on unchanged to ConvertToPlug::process_type, whose return
# value is returned.
#
# NOTE(review): several literals in this sub had their "<...>" content
# stripped in the copy this was restored from.  The split pattern, the first
# title regex, the "&nbsp;" entity and the section markup strings below are
# reconstructions -- verify them against a pristine copy of the file.
sub process {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};

    if ($self->{'use_sections'} && $self->{'converted_to'} eq "HTML") {

        print $outhandle "PDFPlug: Calculating sections...\n";
        my $textref = $_[0];

        # we have "<a name=1></a>" etc for each page
        # NOTE(review): reconstructed pattern; either letter case is accepted.
        my @sections = split (/<[Aa] name=/, $$textref);

        # Short title built from the first chunk of text.
        # NOTE(review): in the code as visible this value is overwritten in
        # the loop below before anything reads it, unless the stripped
        # top-section markup originally interpolated it.
        my $title = defined ($sections[0]) ? $sections[0] : "";
        $title =~ s/^\d+>//;               # specific for pdftohtml...
        $title =~ s/<\/([^>]+)><\1>//g;    # (eg) </b><b> - no space
        $title =~ s/<[^>]*>/ /g;
        $title =~ s/(?:&nbsp;|\xc2\xa0)/ /g;    # utf-8 for nbsp...
        $title =~ s/^\s+//s;
        $title =~ s/\s+$//;
        $title =~ s/\s+/ /gs;
        $title =~ s/^$self->{'title_sub'}// if ($self->{'title_sub'});
        $title =~ s/^\s+//s;               # in case title_sub introduced any...
        $title = substr ($title, 0, 100);
        $title =~ s/\s\S*$/.../;

        my $top_section;
        if (scalar (@sections) == 1) {     # no sections found
            $top_section = $sections[0];
            @sections = ();
        }
        else {
            # NOTE(review): reconstructed markup.
            $top_section = "<!-- from PDFPlug -->\n<!-- <Section> -->\n";
        }

        # add metadata per section...
        foreach my $section (@sections) {
            # leftover from split expression...
            # Only trust $1 when this substitution itself matched; otherwise
            # it would still hold the capture from an earlier, unrelated match.
            if ($section =~ s@^(\d+)></a>@@i) {
                $title = $1;    # Greenstone does magic if sections are titled digits
            }
            else {
                $title = undef;
            }

            if (!defined ($title)) {
                print STDERR "no title: $section\n";
            }
            my $title_text = defined ($title) ? $title : "";

            # NOTE(review): reconstructed markup (description-tag comments
            # wrapping the page text).
            my $newsection = "<!-- from PDFPlug -->\n<!-- <Section>\n";
            $newsection .= "<Description><Metadata name=\"Title\">" . $title_text
                . "</Metadata></Description>\n--><p>\n";
            $newsection .= $section;
            $newsection .= "<!--</Section>-->\n";
            $section = $newsection;    # $section aliases the array element
        }

        $$textref = join ('', ($top_section, @sections));
    }

    # $_[3] is whatever the caller passed as the fourth argument
    # (presumably the file name -- confirm against the caller).
    print $outhandle "PDFPlug: passing $_[3] on to $self->{'converted_to'}Plug\n"
        if $self->{'verbosity'} > 1;

    return ConvertToPlug::process_type ($self, "pdf", @_);
}

1;