Changeset 32277 for main/trunk/greenstone2/bin/script/gsConvert.pl
- Timestamp:
- 2018-07-16T21:28:00+12:00 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r32273 r32277
    78     78    print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    79     79    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    80         - print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
           80  + print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n";
    81     81    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    82     82    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
     …      …
   318    318    my $success = 0;
   319    319    $output_type =~ s/.*\-(.*)/$1/i;
   320    320
          321  + print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";
          322  +
   321    323    # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools
   322    324    # and then decide which conversion command to run based on the output type
     …      …
   353    355    }
   354    356
   355         - elsif ($pdf_tool eq "xpdftools" ) {
   356         - # default to html output
          357  + elsif ($pdf_tool eq "xpdftools" ) {
          358  +
          359  + # default to pretty html output
   357    360    if (!$output_type) {
   358         - $output_type = "html";
          361  + $output_type = "pretty_html";
   359    362    }
   360    363
     …      …
   367    370    #}
   368    371
   369         - # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools.
   370         - if ($output_type =~ m/^(paged_html|html)$/i) {
          372  + # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools.
          373  + if ($output_type =~ m/pretty_html$/i) {
   371    374    $success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
   372    375    if ($success) {
     …      …
   376    379
   377    380    # Attempt conversion to TEXT
   378         - if (!$output_type || ($output_type =~ m/text/i)) {
   379         - $success = &xpdf_to_text($dirname, $input_filename, $output_filestem);
          381  + # Proper paged_text processing not yet implemented with xpdf
          382  + if ($output_type =~ m/text/i) {
          383  + $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type);
   380    384
   381    385    if ($success) {
     …      …
   383    387    }
   384    388    }
   385         - }
   386         -
          389  + }
          390  +
   387    391    return "fail";
   388    392
     …      …
  1048   1052    # Works for Windows too, whereas the old pdftotxt didn't
  1049   1053    sub xpdf_to_text {
  1050         - my ($dirname, $input_filename, $output_filestem) = @_;
         1054  + my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
  1051   1055
  1052   1056    my $cmd = "";
     …      …
  1064   1068    $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html
  1065   1069    }
         1070  +
         1071  + if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers
  1066   1072    $cmd .= " -nopgbrk";
         1073  + }
  1067   1074    # Avoid the silly solitary carriage returns (CR in Notepad) at the end
  1068   1075    # of lines that ends up as \n appended to the doc title
Note:
See TracChangeset
for help on using the changeset viewer.