Changeset 32277


Ignore:
Timestamp:
2018-07-16T21:28:00+12:00 (6 years ago)
Author:
ak19
Message:

First attempt at PDFv2Plugin.pm.

Location:
main/trunk/greenstone2
Files:
1 added
5 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32273 r32277  
    7878    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    7979    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    80     print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
     80    print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n";
    8181    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    8282    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
     
    318318    my $success = 0;
    319319    $output_type =~ s/.*\-(.*)/$1/i;
    320 
     320   
     321    print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n";
     322   
    321323    # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools
    322324    # and then decide which conversion command to run based on the output type
     
    353355  }
    354356   
    355   elsif ($pdf_tool eq "xpdftools" ) {
    356     # default to html output
     357    elsif ($pdf_tool eq "xpdftools" ) {
     358   
     359    # default to pretty html output
    357360    if (!$output_type) {
    358         $output_type = "html";
     361        $output_type = "pretty_html";
    359362    }
    360363   
     
    367370    #}
    368371   
    369     # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools.
    370     if ($output_type =~ m/^(paged_html|html)$/i) {
     372    # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools.
     373    if ($output_type =~ m/pretty_html$/i) {
    371374        $success = &xpdf_to_html($dirname, $input_filename, $output_filestem);
    372375        if ($success) {
     
    376379   
    377380    # Attempt conversion to TEXT
    378     if (!$output_type || ($output_type =~ m/text/i)) {     
    379         $success = &xpdf_to_text($dirname, $input_filename, $output_filestem);
     381    # Proper paged_text processing not yet implemented with xpdf
     382    if ($output_type =~ m/text/i) {
     383        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type);
    380384       
    381385        if ($success) {
     
    383387        }
    384388    }
    385   }
    386    
     389    }
     390   
    387391    return "fail";
    388392
     
    10481052# Works for Windows too, whereas the old pdftotxt didn't
    10491053sub xpdf_to_text {
    1050     my ($dirname, $input_filename, $output_filestem) = @_;
     1054    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
    10511055
    10521056    my $cmd = "";
     
    10641068        $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html
    10651069    }
     1070
     1071    if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers
    10661072    $cmd .= " -nopgbrk";
     1073    }
    10671074    # Avoid the silly solitary carriage returns (CR in Notepad) at the end
    10681075    # of lines that ends up as \n appended to the doc title
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r32206 r32277  
    161161    }
    162162
    163     if ($convert_to =~ /^html/ || $convert_to eq "paged_html") { # may be html or html_multi, or paged_html with the new Xpdf's own pdftohtml
     163    if ($convert_to =~ /^html/ || $convert_to =~ /pretty_html$/) { # may be html or html_multi, or (paged_)pretty_html with the new Xpdf's own pdftohtml
    164164    $self->{'convert_to_plugin'} = "HTMLPlugin";
    165165    $self->{'convert_to_ext'} = "html";
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32275 r32277  
    159159    # check convert_to
    160160    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
    161     &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFPlugin");
     161    &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n");
    162162    $self->{'convert_to'} = "html";
    163163    }
  • main/trunk/greenstone2/perllib/plugins/PDFv1Plugin.pm

    r32275 r32277  
    9393       'type' => "flag" },
    9494     { 'name' => "zoom",
    95        'desc' => "{PDFPlugin.zoom}",
     95       'desc' => "{PDFv1Plugin.zoom}",
    9696       'deft' => "2",
    9797       'range' => "1,3", # actually the range is 0.5-3
     
    147147    # check convert_to
    148148    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
    149     &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFv1Plugin");
     149    &gsprintf::gsprintf(STDERR, "{PDFv1Plugin.win_old_pdftotext_unsupported}\n");
    150150    $self->{'convert_to'} = "html";
    151151    }
  • main/trunk/greenstone2/perllib/strings.properties

    r32275 r32277  
    845845ConvertBinaryFile.convert_to.text:Plain text format.
    846846
     847ConvertBinaryFile.convert_to.paged_text:Text separately extracted for each individual page.
     848
    847849ConvertBinaryFile.convert_to.pagedimg:A series of images.
    848850
     
    852854
    853855ConvertBinaryFile.convert_to.pagedimg_png:A series of images in PNG format.
     856
     857ConvertBinaryFile.convert_to.pagedimgtxt_jpg:A series of images in JPEG format with any extracted text, one for each page.
     858
     859ConvertBinaryFile.convert_to.pagedimgtxt_png:A series of images in PNG format with any extracted text, one for each page.
    854860
    855861ConvertBinaryFile.desc:This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin, PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of these document types to either HTML, TEXT or a series of images. It works by dynamically loading an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.
     
    11651171PDFPlugin.complex:Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path).
    11661172
    1167 PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings.
     1173PDFPlugin.convert_to.html:HTML. Text only, no images.
     1174
     1175PDFPlugin.convert_to.pretty_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings.
     1176
     1177PDFPlugin.convert_to.paged_pretty_html:Sectionalised variant of pretty_html to allow jumping to individual pages.
    11681178
    11691179PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n
     
    11791189PDFPlugin.use_sections:Create a separate section for each page of the PDF file.
    11801190
    1181 PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, %s does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you really want pdf to text conversion.
    1182 
    1183 PDFPlugin.zoom:The factor by which to zoom the PDF for output. If not outputting as paged_html, then zoom is only useful if -complex is set.
     1191PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, PDFPlugin pdfbox_conversion must be turned on for text output. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin for additional pdf to text conversion options.
     1192
     1193PDFv1Plugin.win_old_pdftotext_unsupported:*** On Windows, PDFv1Plugin does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you want pdf to actual text conversion.
     1194
     1195PDFv1Plugin.zoom:The factor by which to zoom the PDF for output. Only useful if -complex is set.
     1196
     1197PDFv2Plugin.zoom:The factor by which to zoom the PDF for (paged_)pretty_html output. Can be fractional.
     1198
     1199PDFv2Plugin.win_pdftotext_info:PDFv2Plugin uses Xpdf Tools to support pdf to text conversion, including on Windows.
     1200
     1201PDFv2Plugin.conversion_needs_pdfbox:*** Conversion to %s not supported with Xpdf Tools, defaulting to %s. Turn on pdfbox_conversion if you wish to enable output to selected format.
    11841202
    11851203PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html
Note: See TracChangeset for help on using the changeset viewer.