Changeset 32277
- Timestamp:
- 2018-07-16T21:28:00+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2
- Files:
-
- 1 added
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r32273 r32277 78 78 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 79 79 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 80 print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; 80 print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n"; 81 81 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 82 82 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; … … 318 318 my $success = 0; 319 319 $output_type =~ s/.*\-(.*)/$1/i; 320 320 321 print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n"; 322 321 323 # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools 322 324 # and then decide which conversion command to run based on the output type … … 353 355 } 354 356 355 elsif ($pdf_tool eq "xpdftools" ) { 356 # default to html output 357 elsif ($pdf_tool eq "xpdftools" ) { 358 359 # default to pretty html output 357 360 if (!$output_type) { 358 $output_type = "html"; 361 $output_type = "pretty_html"; 359 362 } 360 363 … … 367 370 #} 368 371 369 # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. 370 if ($output_type =~ m/^(paged_html|html)$/i) { 372 # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools. 
373 if ($output_type =~ m/pretty_html$/i) { 371 374 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 372 375 if ($success) { … … 376 379 377 380 # Attempt conversion to TEXT 378 if (!$output_type || ($output_type =~ m/text/i)) { 379 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 381 # Proper paged_text processing not yet implemented with xpdf 382 if ($output_type =~ m/text/i) { 383 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type); 380 384 381 385 if ($success) { … … 383 387 } 384 388 } 385 } 386 389 } 390 387 391 return "fail"; 388 392 … … 1048 1052 # Works for Windows too, whereas the old pdftotxt didn't 1049 1053 sub xpdf_to_text { 1050 my ($dirname, $input_filename, $output_filestem) = @_; 1054 my ($dirname, $input_filename, $output_filestem, $output_type) = @_; 1051 1055 1052 1056 my $cmd = ""; … … 1064 1068 $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html 1065 1069 } 1070 1071 if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers 1066 1072 $cmd .= " -nopgbrk"; 1073 } 1067 1074 # Avoid the silly solitary carriage returns (CR in Notepad) at the end 1068 1075 # of lines that ends up as \n appended to the doc title -
main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm
r32206 r32277 161 161 } 162 162 163 if ($convert_to =~ /^html/ || $convert_to eq "paged_html") { # may be html or html_multi, or paged_html with the new Xpdf's own pdftohtml 163 if ($convert_to =~ /^html/ || $convert_to =~ /pretty_html$/) { # may be html or html_multi, or (paged_)pretty_html with the new Xpdf's own pdftohtml 164 164 $self->{'convert_to_plugin'} = "HTMLPlugin"; 165 165 $self->{'convert_to_ext'} = "html"; -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r32275 r32277 159 159 # check convert_to 160 160 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 161 &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFPlugin"); 161 &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n"); 162 162 $self->{'convert_to'} = "html"; 163 163 } -
main/trunk/greenstone2/perllib/plugins/PDFv1Plugin.pm
r32275 r32277 93 93 'type' => "flag" }, 94 94 { 'name' => "zoom", 95 'desc' => "{PDFPlugin.zoom}", 95 'desc' => "{PDFv1Plugin.zoom}", 96 96 'deft' => "2", 97 97 'range' => "1,3", # actually the range is 0.5-3 … … 147 147 # check convert_to 148 148 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 149 &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFv1Plugin"); 149 &gsprintf::gsprintf(STDERR, "{PDFv1Plugin.win_old_pdftotext_unsupported}\n"); 150 150 $self->{'convert_to'} = "html"; 151 151 } -
main/trunk/greenstone2/perllib/strings.properties
r32275 r32277 845 845 ConvertBinaryFile.convert_to.text:Plain text format. 846 846 847 ConvertBinaryFile.convert_to.paged_text:Text separately extracted for each individual page. 848 847 849 ConvertBinaryFile.convert_to.pagedimg:A series of images. 848 850 … … 852 854 853 855 ConvertBinaryFile.convert_to.pagedimg_png:A series of images in PNG format. 856 857 ConvertBinaryFile.convert_to.pagedimgtxt_jpg:A series of images in JPEG format with any extracted text, one for each page. 858 859 ConvertBinaryFile.convert_to.pagedimgtxt_png:A series of images in PNG format with any extracted text, one for each page. 854 860 855 861 ConvertBinaryFile.desc:This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin, PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of these document types to either HTML, TEXT or a series of images. It works by dynamically loading an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'. … … 1165 1171 PDFPlugin.complex:Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path). 1166 1172 1167 PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 1173 PDFPlugin.convert_to.html:HTML. Text only, no images. 1174 1175 PDFPlugin.convert_to.pretty_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 
1176 1177 PDFPlugin.convert_to.paged_pretty_html:Sectionalised variant of pretty_html to allow jumping to individual pages. 1168 1178 1169 1179 PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n … … 1179 1189 PDFPlugin.use_sections:Create a separate section for each page of the PDF file. 1180 1190 1181 PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, %s does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you really want pdf to text conversion. 1182 1183 PDFPlugin.zoom:The factor by which to zoom the PDF for output. If not outputting as paged_html, then zoom is only useful if -complex is set. 1191 PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, PDFPlugin pdfbox_conversion must be turned on for text output. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin for additional pdf to text conversion options. 1192 1193 PDFv1Plugin.win_old_pdftotext_unsupported:*** On Windows, PDFv1Plugin does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you want pdf to actual text conversion. 1194 1195 PDFv1Plugin.zoom:The factor by which to zoom the PDF for output. Only useful if -complex is set. 1196 1197 PDFv2Plugin.zoom:The factor by which to zoom the PDF for (paged_)pretty_html output. Can be fractional. 1198 1199 PDFv2Plugin.win_pdftotext_info:PDFv2Plugin uses Xpdf Tools to support pdf to text conversion, including on Windows. 1200 1201 PDFv2Plugin.conversion_needs_pdfbox:*** Conversion to %s not supported with Xpdf Tools, defaulting to %s. Turn on pdfbox_conversion if you wish to enable output to selected format. 
1184 1202 1185 1203 PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html
Note:
See TracChangeset
for help on using the changeset viewer.