Changeset 32273 for main/trunk/greenstone2/bin/script/gsConvert.pl
- Timestamp: 2018-07-13T20:40:24+12:00 (6 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/bin/script/gsConvert.pl
r32263 r32273 61 61 62 62 my $use_strings; 63 my $pdf_tool; 63 64 my $pdf_complex; 64 65 my $pdf_nohidden; … … 77 78 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 78 79 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 79 print STDERR "\t-output\tauto|html| text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";80 print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; 80 81 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 81 82 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 82 83 print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n"; 84 print STDERR "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n"; 83 85 print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 84 86 print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; … … 120 122 "type/$type_re/", \$input_type, 121 123 '/errlog/.*/', \$faillogfile, 122 'output/(auto|html|text|pagedimg).*/', \$output_type, 124 'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html 123 125 'timeout/\d+/0',\$timeout, 124 126 'verbose/\d+/0', \$verbose, 125 127 'windows_scripting',\$windows_scripting, 126 128 'use_strings', \$use_strings, 127 'pdf_complex', \$pdf_complex, 129 'pdf_tool/(pdftohtml|pdfbox|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools 130 'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool) 128 131 'pdf_ignore_images', \$pdf_ignore_images, 129 132 'pdf_allow_images_only', \$pdf_allow_images_only, … … 315 318 my $success = 0; 316 319 $output_type =~ s/.*\-(.*)/$1/i; 320 321 # First determine which pdf conversion tool we're using among 
pdftohtml/pdfbox/xpdftools 322 # and then decide which conversion command to run based on the output type 323 # (pdfbox does not currently go through gsConvert.pl 324 # as PDFBoxConverter inherits from AutoLoadConverters) 325 326 if ($pdf_tool eq "pdftohtml" ) { # old pdftohtml tool 317 327 # Attempt coversion to Image 318 328 if ($output_type =~ m/jp?g|gif|png/i) { … … 333 343 } 334 344 335 # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This 336 # will be the new default for PDFs when output_type for PDF docs is not specified 337 # (once our use of xpdftools' pdftohtml has been implemented on win and mac). 338 #if ($output_type =~ m/paged_html/i) { 339 if (!$output_type || ($output_type =~ m/paged_html/i)) { 340 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 341 if ($success) { 342 return "paged_html"; 343 } 344 } 345 346 # Attempt conversion to TEXT 345 # Attempt conversion to TEXT (not for Windows, but PDFPlugin/PDFv1Plugin takes care of that 347 346 if (!$output_type || ($output_type =~ m/text/i)) { 348 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 349 #if ($ENV{'GSDLOS'} =~ m/^windows$/i) { # we now have pdf to text support for windows by using xpdf tools 350 # $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 351 #} else { 352 # $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 353 #} 347 $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 348 354 349 if ($success) { 355 350 return "text"; 356 351 } 357 352 } 358 353 } 354 355 elsif ($pdf_tool eq "xpdftools" ) { 356 # default to html output 357 if (!$output_type) { 358 $output_type = "html"; 359 } 360 361 # Attempt coversion to Image 362 #if ($output_type =~ m/jp?g|gif|png/i) { 363 # $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type); 364 # if ($success){ 365 # return "item"; 366 # } 367 #} 368 369 # Attempt conversion to (paged) HTML using 
the newer pdftohtml of Xpdftools. 370 if ($output_type =~ m/^(paged_html|html)$/i) { 371 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 372 if ($success) { 373 return $output_type; 374 } 375 } 376 377 # Attempt conversion to TEXT 378 if (!$output_type || ($output_type =~ m/text/i)) { 379 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 380 381 if ($success) { 382 return "text"; 383 } 384 } 385 } 386 359 387 return "fail"; 360 388
Note: See TracChangeset for help on using the changeset viewer.