- Timestamp:
- 2018-07-13T20:40:24+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2
- Files:
-
- 1 added
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r32263 r32273 61 61 62 62 my $use_strings; 63 my $pdf_tool; 63 64 my $pdf_complex; 64 65 my $pdf_nohidden; … … 77 78 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 78 79 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 79 print STDERR "\t-output\tauto|html| text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";80 print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; 80 81 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 81 82 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 82 83 print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n"; 84 print STDERR "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n"; 83 85 print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 84 86 print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; … … 120 122 "type/$type_re/", \$input_type, 121 123 '/errlog/.*/', \$faillogfile, 122 'output/(auto|html|text|pagedimg).*/', \$output_type, 124 'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html 123 125 'timeout/\d+/0',\$timeout, 124 126 'verbose/\d+/0', \$verbose, 125 127 'windows_scripting',\$windows_scripting, 126 128 'use_strings', \$use_strings, 127 'pdf_complex', \$pdf_complex, 129 'pdf_tool/(pdftohtml|pdfbox|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools 130 'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool) 128 131 'pdf_ignore_images', \$pdf_ignore_images, 129 132 'pdf_allow_images_only', \$pdf_allow_images_only, … … 315 318 my $success = 0; 316 319 $output_type =~ s/.*\-(.*)/$1/i; 320 321 # First determine which pdf conversion tool we're using among 
pdftohtml/pdfbox/xpdftools 322 # and then decide which conversion command to run based on the output type 323 # (pdfbox does not currently go through gsConvert.pl 324 # as PDFBoxConverter inherits from AutoLoadConverters) 325 326 if ($pdf_tool eq "pdftohtml" ) { # old pdftohtml tool 317 327 # Attempt coversion to Image 318 328 if ($output_type =~ m/jp?g|gif|png/i) { … … 333 343 } 334 344 335 # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This 336 # will be the new default for PDFs when output_type for PDF docs is not specified 337 # (once our use of xpdftools' pdftohtml has been implemented on win and mac). 338 #if ($output_type =~ m/paged_html/i) { 339 if (!$output_type || ($output_type =~ m/paged_html/i)) { 340 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 341 if ($success) { 342 return "paged_html"; 343 } 344 } 345 346 # Attempt conversion to TEXT 345 # Attempt conversion to TEXT (not for Windows, but PDFPlugin/PDFv1Plugin takes care of that 347 346 if (!$output_type || ($output_type =~ m/text/i)) { 348 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 349 #if ($ENV{'GSDLOS'} =~ m/^windows$/i) { # we now have pdf to text support for windows by using xpdf tools 350 # $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 351 #} else { 352 # $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 353 #} 347 $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 348 354 349 if ($success) { 355 350 return "text"; 356 351 } 357 352 } 358 353 } 354 355 elsif ($pdf_tool eq "xpdftools" ) { 356 # default to html output 357 if (!$output_type) { 358 $output_type = "html"; 359 } 360 361 # Attempt coversion to Image 362 #if ($output_type =~ m/jp?g|gif|png/i) { 363 # $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type); 364 # if ($success){ 365 # return "item"; 366 # } 367 #} 368 369 # Attempt conversion to (paged) HTML using 
the newer pdftohtml of Xpdftools. 370 if ($output_type =~ m/^(paged_html|html)$/i) { 371 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 372 if ($success) { 373 return $output_type; 374 } 375 } 376 377 # Attempt conversion to TEXT 378 if (!$output_type || ($output_type =~ m/text/i)) { 379 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 380 381 if ($success) { 382 return "text"; 383 } 384 } 385 } 386 359 387 return "fail"; 360 388 -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r32224 r32273 140 140 $self->{'file_type'} = "PDF"; 141 141 142 # PDFPlugin is deprecated and migrating users should hereafter choose between 143 # PDFv1Plugin, if they want to use the old pdftohtml tool's capabilities, 144 # and PDFv2Plugin, if they want to use pdfbox or the new xpdftools capabilities. 145 &gsprintf::gsprintf(STDERR, "{PDFPlugin.deprecated_plugin}"); 146 142 147 # these are passed through to gsConvert.pl by ConvertBinaryFile.pm 143 148 my $zoom = $self->{"zoom"}; 144 $self->{'convert_options'} = "-pdf_zoom $zoom"; 149 # By default, PDFPlugin assumes gsConvert.pl will run the old pdftohtml conversion tool, 150 # But if pdfbox conversion is turned on, the tool used is pdfbox (which is presently an 151 # AutoLoadConverter and therefore bypasses gsConvert.pl) 152 $self->{'convert_options'} = "-pdf_tool pdftohtml"; 153 $self->{'convert_options'} .= " -pdf_zoom $zoom"; 145 154 $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"}; 146 155 $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"}; … … 151 160 # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 152 161 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 153 154 #print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";155 #$self->{'convert_to'} = "html";162 #print STDERR "On Windows, Greenstone now uses Xpdf tools to support pdf to text conversion.\n"; 163 print STDERR "Windows does not support pdf to text. 
PDFs will be converted to HTML instead\n"; 164 $self->{'convert_to'} = "html"; 156 165 } 157 166 elsif ($self->{'convert_to'} eq "auto") { … … 407 416 # Copying file open/close code from CommonUtil::utf8_write_file() 408 417 if (!open (OUTFILE, ">:utf8", $output_filename)) { 409 gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename); 418 gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename); 410 419 die "\n"; 411 420 } -
main/trunk/greenstone2/perllib/strings.properties
r32222 r32273 809 809 CommonUtil.block_exp:Files matching this regular expression will be blocked from being passed to any later plugins in the list. 810 810 811 CommonUtil.could_not_open_for_writing:could not open %s for writing 812 811 813 CommonUtil.desc:Base Utility plugin class that handles filename encoding and file blocking. 812 814 … … 1165 1167 PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 1166 1168 1167 PDFPlugin.desc:Plugin that processes PDF documents. 1169 PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n 1170 1171 PDFPlugin.desc:Plugin that processes PDF documents using the older pdftohtml tool. Does not support newer PDF versions. 1168 1172 1169 1173 PDFPlugin.nohidden:Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.
Note:
See TracChangeset
for help on using the changeset viewer.