Changeset 3720


Ignore:
Timestamp:
2003-01-28T16:22:26+13:00 (21 years ago)
Author:
sjboddie
Message:

Added options to PDFPlug to take advantage of the improvements in
version 0.34 of pdftohtml. It now works much better for non-Latin
input documents (producing UTF-8 encoded HTML).

Location:
trunk/gsdl
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r3538 r3720  
    5959
    6060my $use_strings;
     61my $pdf_complex;
     62my $pdf_zoom;
     63my $pdf_ignore_images;
    6164
    6265sub print_usage
     
    7073    print STDERR "\t-output\thtml|text\n";
    7174    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    72     print STDERR "\t-use_strings\t(use strings to extract text if conversion fails)\n";
     75    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
     76    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
     77    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
     78    print STDERR "\t\tconverting PDF to HTML\n";
     79    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
     80    print STDERR "\t\t-pdf_complex is set\n";
    7381    exit(1);
    7482}
     
    8997             'timeout/\d+/0',\$timeout,
    9098             'verbose/\d+/0',   \$verbose,
    91              'use_strings', \$use_strings))
     99             'use_strings', \$use_strings,
     100             'pdf_complex', \$pdf_complex,
     101             'pdf_zoom/\d+/2', \$pdf_zoom
     102             ))
    92103    {
    93104    print_usage();
     
    628639    $cmd = "";
    629640    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    630     $cmd .= "perl -S pdftohtml.pl ";
     641    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
     642    $cmd .= " -c" if ($pdf_complex);
     643    $cmd .= " -i" if ($pdf_ignore_images);
    631644    $cmd .= " \"$input_filename\" \"$output_filestem\"";
    632645   
  • trunk/gsdl/bin/script/pdftohtml.pl

    r3522 r3720  
    4545# note - we don't actually ever use most of these options...
    4646print STDERR 
    47     ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",
     47    ("pdftohtml.pl wrapper for pdftohtml.\n",
    4848     "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
    4949     "Options:\n",
    5050     "\t-i\tignore images (don't extract)\n",
    51      "\t-a\tallow images only (continue even if no text is present)\n"
     51     "\t-a\tallow images only (continue even if no text is present)\n",
     52     "\t-c\tproduce complex output (requires ghostscript)\n",
     53     "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n"
    5254     );
    5355exit (1);
     
    5658sub main {
    5759    my (@ARGV) = @_;
    58     my ($allow_no_text,$ignore_images);
     60    my ($allow_no_text, $ignore_images, $complex, $zoom);
    5961   
    6062    # read command-line arguments so that
     
    6264    if (!parsargv::parse(\@ARGV,
    6365             'a', \$allow_no_text,
    64              'i', \$ignore_images
     66             'i', \$ignore_images,
     67             'c', \$complex,
     68             'zoom/\d+/2', \$zoom,
    6569             ))
    6670    {
     
    110114
    111115    $cmd .= " -i" if ($ignore_images);
    112     $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
     116    $cmd .= " -c" if ($complex);
     117    $cmd .= " -zoom $zoom";
     118    $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\"";
    113119
    114120# system() returns -1 if it can't run, otherwise it's $cmds ret val.
     
    187193    $directory =~ s@[^\/]*$@@;    # assume filename has no embedded slashes...
    188194
    189     open (IMAGES, "${directory}images.log") ||
    190     open (IMAGES, "${directory}image.log") ||
    191           print STDERR "Error opening image log:$!\n";
    192     while (<IMAGES>) {
    193     push (@images, $_);
    194     }
    195     close IMAGES;
    196     &util::rm("${directory}image.log") if (-e "${directory}image.log");
     195    if (open (IMAGES, "${directory}images.log") ||
     196    open (IMAGES, "${directory}image.log")) {
     197    while (<IMAGES>) {
     198        push (@images, $_);
     199    }
     200    close IMAGES;
     201    &util::rm("${directory}image.log") if (-e "${directory}image.log");
     202
     203    }
    197204
    198205    # no need to go any further if there is no text extracted from pdf.
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r3540 r3720  
    204204    my $output_type = lc($convert_to);
    205205    my $cmd = "perl -S gsConvert.pl -verbose $verbosity ";
     206    if (defined $self->{'convert_options'}) {
     207    $cmd .= $self->{'convert_options'} . " ";
     208    }
    206209    if ($self->{'use_strings'}) {
    207210      $cmd .= "-use_strings ";
    208211    }
    209212    $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
     213
    210214    $output_type = `$cmd`;
    211215
  • trunk/gsdl/perllib/plugins/PDFPlug.pm

    r3614 r3720  
    4040            'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.",
    4141            'type' => 'string',
    42             'deft' =>  q^^ }
     42            'deft' =>  q^^ },
     43          { 'name' => "noimages",
     44            'desc' =>  "Don't attempt to extract images from PDF.",
     45            'type' => "flag" },
     46          { 'name' => "complex",
     47            'desc' =>  "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).",
     48            'type' => "flag" },
     49          { 'name' => "zoom",
     50            'desc' =>  "The factor by which to zoomthe PDF for output (this is only useful if -complex is set).",
     51            'type' => "int" }
    4352          ];
    4453
     
    5059sub new {
    5160    my $class = shift (@_);
     61
     62    my ($noimages, $complex, $zoom);
     63   
     64    if (!parsargv::parse(\@_,
     65             q^noimages^, \$noimages,
     66             q^complex^, \$complex,
     67             q^zoom/\d+/2^, \$zoom,
     68             "allow_extra_options")) {
     69
     70    print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n";
     71    &print_usage();
     72    die "\n";
     73    }
    5274
    5375    # following title_sub removes "Page 1" added by pdftohtml, and a leading
     
    81103    my $option_list = $self->{'option_list'};
    82104    push( @{$option_list}, $options );
     105
     106    # these are passed through to gsConvert.pl by ConvertToPlug.pm
     107    $self->{'convert_options'} = "-pdf_zoom $zoom";
     108    $self->{'convert_options'} .= " -pdf_complex" if $complex;
     109    $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages;
     110
     111    # pdftohtml will always produce html files encoded as utf-8
     112    if ($self->{'input_encoding'} eq "auto") {
     113    $self->{'input_encoding'} = "utf8";
     114    $self->{'extract_language'} = 1;
     115    }
    83116   
    84117    return bless $self, $class;
     
    89122    print STDERR "\n  usage: plugin PDFPlug [options]\n\n";
    90123    print STDERR "  options:\n";
    91     print STDERR "   -convert_to (html|text) plugin converts to TEXT or HTML\n";
    92     print STDERR "                           (default html)\n";
    93     print STDERR "   -use_sections     create a separate section for each page of the PDF file.\n\n";
     124    print STDERR "   -convert_to (html|text) Convert to TEXT or HTML (default html)\n";
     125    print STDERR "   -use_sections     Create a separate section for each page\n";
     126    print STDERR "                     of the PDF file.\n";
     127    print STDERR "   -noimages         Don't attempt to extract images from PDF.\n";
     128    print STDERR "   -complex          Create more complex output. With this option\n";
     129    print STDERR "                     set the output html will look much more like\n";
     130    print STDERR "                     the original PDF file. For this to function\n";
     131    print STDERR "                     properly you Ghostscript installed (for *nix\n";
     132    print STDERR "                     gs should be on your path while for windows\n";
     133    print STDERR "                     you must have gswin32c.exe on your path).\n";
     134    print STDERR "   -zoom             The factor by which to zoomthe PDF for output\n";
     135    print STDERR "                     (this is only useful if -complex is set).\n\n";
    94136}
    95137
Note: See TracChangeset for help on using the changeset viewer.