Changeset 3720
- Timestamp:
- 2003-01-28T16:22:26+13:00 (21 years ago)
- Location:
- trunk/gsdl
- Files:
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/gsConvert.pl
r3538 r3720 59 59 60 60 my $use_strings; 61 my $pdf_complex; 62 my $pdf_zoom; 63 my $pdf_ignore_images; 61 64 62 65 sub print_usage … … 70 73 print STDERR "\t-output\thtml|text\n"; 71 74 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 72 print STDERR "\t-use_strings\t(use strings to extract text if conversion fails)\n"; 75 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 76 print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 77 print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n"; 78 print STDERR "\t\tconverting PDF to HTML\n"; 79 print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n"; 80 print STDERR "\t\t-pdf_complex is set\n"; 73 81 exit(1); 74 82 } … … 89 97 'timeout/\d+/0',\$timeout, 90 98 'verbose/\d+/0', \$verbose, 91 'use_strings', \$use_strings)) 99 'use_strings', \$use_strings, 100 'pdf_complex', \$pdf_complex, 101 'pdf_zoom/\d+/2', \$pdf_zoom 102 )) 92 103 { 93 104 print_usage(); … … 628 639 $cmd = ""; 629 640 if ($timeout) {$cmd = "ulimit -t $timeout;";} 630 $cmd .= "perl -S pdftohtml.pl "; 641 $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom"; 642 $cmd .= " -c" if ($pdf_complex); 643 $cmd .= " -i" if ($pdf_ignore_images); 631 644 $cmd .= " \"$input_filename\" \"$output_filestem\""; 632 645 -
trunk/gsdl/bin/script/pdftohtml.pl
r3522 r3720 45 45 # note - we don't actually ever use most of these options... 46 46 print STDERR 47 ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",47 ("pdftohtml.pl wrapper for pdftohtml.\n", 48 48 "Usage: pdftohtml [options] <PDF-file> <html-file>\n", 49 49 "Options:\n", 50 50 "\t-i\tignore images (don't extract)\n", 51 "\t-a\tallow images only (continue even if no text is present)\n" 51 "\t-a\tallow images only (continue even if no text is present)\n", 52 "\t-c\tproduce complex output (requires ghostscript)\n", 53 "\t-zoom\tfactor by which to zoom the PDF (only useful if -c is set)\n" 52 54 ); 53 55 exit (1); … … 56 58 sub main { 57 59 my (@ARGV) = @_; 58 my ($allow_no_text, $ignore_images);60 my ($allow_no_text, $ignore_images, $complex, $zoom); 59 61 60 62 # read command-line arguments so that … … 62 64 if (!parsargv::parse(\@ARGV, 63 65 'a', \$allow_no_text, 64 'i', \$ignore_images 66 'i', \$ignore_images, 67 'c', \$complex, 68 'zoom/\d+/2', \$zoom, 65 69 )) 66 70 { … … 110 114 111 115 $cmd .= " -i" if ($ignore_images); 112 $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\""; 116 $cmd .= " -c" if ($complex); 117 $cmd .= " -zoom $zoom"; 118 $cmd .= " -noframes -p -enc UTF-8 \"$input_filename\" \"$output_filestem.html\""; 113 119 114 120 # system() returns -1 if it can't run, otherwise it's $cmds ret val. … … 187 193 $directory =~ s@[^\/]*$@@; # assume filename has no embedded slashes... 
188 194 189 open (IMAGES, "${directory}images.log") || 190 open (IMAGES, "${directory}image.log") || 191 print STDERR "Error opening image log:$!\n"; 192 while (<IMAGES>) { 193 push (@images, $_); 194 } 195 close IMAGES; 196 &util::rm("${directory}image.log") if (-e "${directory}image.log"); 195 if (open (IMAGES, "${directory}images.log") || 196 open (IMAGES, "${directory}image.log")) { 197 while (<IMAGES>) { 198 push (@images, $_); 199 } 200 close IMAGES; 201 &util::rm("${directory}image.log") if (-e "${directory}image.log"); 202 203 } 197 204 198 205 # no need to go any further if there is no text extracted from pdf. -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r3540 r3720 204 204 my $output_type = lc($convert_to); 205 205 my $cmd = "perl -S gsConvert.pl -verbose $verbosity "; 206 if (defined $self->{'convert_options'}) { 207 $cmd .= $self->{'convert_options'} . " "; 208 } 206 209 if ($self->{'use_strings'}) { 207 210 $cmd .= "-use_strings "; 208 211 } 209 212 $cmd .= "-errlog \"$errlog\" -output $output_type \"$tmp_filename\""; 213 210 214 $output_type = `$cmd`; 211 215 -
trunk/gsdl/perllib/plugins/PDFPlug.pm
r3614 r3720 40 40 'desc' => "Files matching this regular expression will be blocked from being passed to any later plugins in the list. This has no real effect other than to prevent lots of warning messages about input files you don't care about. Each plugin might have a default block_exp. e.g. by default HTMLPlug blocks any files with .gif, .jpg, .jpeg, .png or .css file extensions.", 41 41 'type' => 'string', 42 'deft' => q^^ } 42 'deft' => q^^ }, 43 { 'name' => "noimages", 44 'desc' => "Don't attempt to extract images from PDF.", 45 'type' => "flag" }, 46 { 'name' => "complex", 47 'desc' => "Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).", 48 'type' => "flag" }, 49 { 'name' => "zoom", 50 'desc' => "The factor by which to zoomthe PDF for output (this is only useful if -complex is set).", 51 'type' => "int" } 43 52 ]; 44 53 … … 50 59 sub new { 51 60 my $class = shift (@_); 61 62 my ($noimages, $complex, $zoom); 63 64 if (!parsargv::parse(\@_, 65 q^noimages^, \$noimages, 66 q^complex^, \$complex, 67 q^zoom/\d+/2^, \$zoom, 68 "allow_extra_options")) { 69 70 print STDERR "\nIncorrect options passed to PDFPlug, check your collect.cfg configuration file\n"; 71 &print_usage(); 72 die "\n"; 73 } 52 74 53 75 # following title_sub removes "Page 1" added by pdftohtml, and a leading … … 81 103 my $option_list = $self->{'option_list'}; 82 104 push( @{$option_list}, $options ); 105 106 # these are passed through to gsConvert.pl by ConvertToPlug.pm 107 $self->{'convert_options'} = "-pdf_zoom $zoom"; 108 $self->{'convert_options'} .= " -pdf_complex" if $complex; 109 $self->{'convert_options'} .= " -pdf_ignore_images" if $noimages; 110 111 # pdftohtml will always produce html files encoded as utf-8 112 if ($self->{'input_encoding'} eq "auto") { 113 
$self->{'input_encoding'} = "utf8"; 114 $self->{'extract_language'} = 1; 115 } 83 116 84 117 return bless $self, $class; … … 89 122 print STDERR "\n usage: plugin PDFPlug [options]\n\n"; 90 123 print STDERR " options:\n"; 91 print STDERR " -convert_to (html|text) plugin converts to TEXT or HTML\n"; 92 print STDERR " (default html)\n"; 93 print STDERR " -use_sections create a separate section for each page of the PDF file.\n\n"; 124 print STDERR " -convert_to (html|text) Convert to TEXT or HTML (default html)\n"; 125 print STDERR " -use_sections Create a separate section for each page\n"; 126 print STDERR " of the PDF file.\n"; 127 print STDERR " -noimages Don't attempt to extract images from PDF.\n"; 128 print STDERR " -complex Create more complex output. With this option\n"; 129 print STDERR " set the output html will look much more like\n"; 130 print STDERR " the original PDF file. For this to function\n"; 131 print STDERR " properly you Ghostscript installed (for *nix\n"; 132 print STDERR " gs should be on your path while for windows\n"; 133 print STDERR " you must have gswin32c.exe on your path).\n"; 134 print STDERR " -zoom The factor by which to zoomthe PDF for output\n"; 135 print STDERR " (this is only useful if -complex is set).\n\n"; 94 136 } 95 137
Note:
See TracChangeset
for help on using the changeset viewer.