Changeset 32277 for main

Show
Ignore:
Timestamp:
16.07.2018 21:28:00 (13 months ago)
Author:
ak19
Message:

First attempt at PDFv2Plugin.pm.

Location:
main/trunk/greenstone2
Files:
1 added
5 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32273 r32277  
    7878    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 
    7979    print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 
    80     print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; 
     80    print STDERR "\t-output\tauto|html|pretty_html|paged_pretty_html|paged_html|text|paged_text|pagedimg_jpg|pagedimg_gif|pagedimg_png|pagedimgtxt_jpg|pagedimgtxt_png\t(output file type)\n"; 
    8181    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 
    8282    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 
     
    318318    my $success = 0; 
    319319    $output_type =~ s/.*\-(.*)/$1/i; 
    320  
     320     
     321    print STDERR "@@@@@@@@ Using $pdf_tool for the conversion\n"; 
     322     
    321323    # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools 
    322324    # and then decide which conversion command to run based on the output type  
     
    353355  } 
    354356     
    355   elsif ($pdf_tool eq "xpdftools" ) { 
    356     # default to html output 
     357    elsif ($pdf_tool eq "xpdftools" ) { 
     358     
     359    # default to pretty html output 
    357360    if (!$output_type) { 
    358         $output_type = "html"; 
     361        $output_type = "pretty_html"; 
    359362    } 
    360363     
     
    367370    #} 
    368371     
    369     # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. 
    370     if ($output_type =~ m/^(paged_html|html)$/i) { 
     372    # Attempt conversion to (paged) pretty HTML using the newer pdftohtml of Xpdftools. 
     373    if ($output_type =~ m/pretty_html$/i) { 
    371374        $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 
    372375        if ($success) { 
     
    376379     
    377380    # Attempt conversion to TEXT 
    378     if (!$output_type || ($output_type =~ m/text/i)) {       
    379         $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 
     381    # Proper paged_text processing not yet implemented with xpdf 
     382    if ($output_type =~ m/text/i) { 
     383        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem, $output_type); 
    380384         
    381385        if ($success) { 
     
    383387        } 
    384388    } 
    385   } 
    386      
     389    } 
     390     
    387391    return "fail"; 
    388392 
     
    10481052# Works for Windows too, whereas the old pdftotxt didn't 
    10491053sub xpdf_to_text { 
    1050     my ($dirname, $input_filename, $output_filestem) = @_; 
     1054    my ($dirname, $input_filename, $output_filestem, $output_type) = @_; 
    10511055 
    10521056    my $cmd = ""; 
     
    10641068        $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html 
    10651069    } 
     1070 
     1071    if ($output_type ne "paged_text") { # output_type eq "text", don't bother about page break markers 
    10661072    $cmd .= " -nopgbrk"; 
     1073    } 
    10671074    # Avoid the silly solitary carriage returns (CR in Notepad) at the end 
    10681075    # of lines that end up as \n appended to the doc title 
  • main/trunk/greenstone2/perllib/plugins/ConvertBinaryFile.pm

    r32206 r32277  
    161161    } 
    162162 
    163     if ($convert_to =~ /^html/ || $convert_to eq "paged_html") { # may be html or html_multi, or paged_html with the new Xpdf's own pdftohtml 
     163    if ($convert_to =~ /^html/ || $convert_to =~ /pretty_html$/) { # may be html or html_multi, or (paged_)pretty_html with the new Xpdf's own pdftohtml 
    164164    $self->{'convert_to_plugin'} = "HTMLPlugin"; 
    165165    $self->{'convert_to_ext'} = "html"; 
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32275 r32277  
    159159    # check convert_to 
    160160    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 
    161     &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFPlugin"); 
     161    &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n"); 
    162162    $self->{'convert_to'} = "html"; 
    163163    } 
  • main/trunk/greenstone2/perllib/plugins/PDFv1Plugin.pm

    r32275 r32277  
    9393       'type' => "flag" }, 
    9494     { 'name' => "zoom", 
    95        'desc' => "{PDFPlugin.zoom}", 
     95       'desc' => "{PDFv1Plugin.zoom}", 
    9696       'deft' => "2", 
    9797       'range' => "1,3", # actually the range is 0.5-3  
     
    147147    # check convert_to 
    148148    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 
    149     &gsprintf::gsprintf(STDERR, "{PDFPlugin.win_old_pdftotext_unsupported}\n", "PDFv1Plugin"); 
     149    &gsprintf::gsprintf(STDERR, "{PDFv1Plugin.win_old_pdftotext_unsupported}\n"); 
    150150    $self->{'convert_to'} = "html"; 
    151151    } 
  • main/trunk/greenstone2/perllib/strings.properties

    r32275 r32277  
    845845ConvertBinaryFile.convert_to.text:Plain text format. 
    846846 
     847ConvertBinaryFile.convert_to.paged_text:Text separately extracted for each individual page. 
     848 
    847849ConvertBinaryFile.convert_to.pagedimg:A series of images. 
    848850 
     
    852854 
    853855ConvertBinaryFile.convert_to.pagedimg_png:A series of images in PNG format.  
     856 
     857ConvertBinaryFile.convert_to.pagedimgtxt_jpg:A series of images in JPEG format with any extracted text, one for each page. 
     858 
     859ConvertBinaryFile.convert_to.pagedimgtxt_png:A series of images in PNG format with any extracted text, one for each page. 
    854860 
    855861ConvertBinaryFile.desc:This plugin is inherited by such plugins as WordPlugin, PowerPointPlugin, PostScriptPlugin, RTFPlugin and PDFPlugin. It facilitates the conversion of these document types to either HTML, TEXT or a series of images. It works by dynamically loading an appropriate secondary plugin (HTMLPlugin, StructuredHTMLPlugin, PagedImagePlugin or TextPlugin) based on the plugin argument 'convert_to'.  
     
    11651171PDFPlugin.complex:Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path). 
    11661172 
    1167 PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 
     1173PDFPlugin.convert_to.html:HTML. Text only, no images. 
     1174 
     1175PDFPlugin.convert_to.pretty_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 
     1176 
     1177PDFPlugin.convert_to.paged_pretty_html:Sectionalised variant of pretty_html to allow jumping to individual pages. 
    11681178 
    11691179PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n 
     
    11791189PDFPlugin.use_sections:Create a separate section for each page of the PDF file. 
    11801190 
    1181 PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, %s does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you really want pdf to text conversion. 
    1182  
    1183 PDFPlugin.zoom:The factor by which to zoom the PDF for output. If not outputting as paged_html, then zoom is only useful if -complex is set. 
     1191PDFPlugin.win_old_pdftotext_unsupported:*** On Windows, PDFPlugin pdfbox_conversion must be turned on for text output. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin for additional pdf to text conversion options. 
     1192 
     1193PDFv1Plugin.win_old_pdftotext_unsupported:*** On Windows, PDFv1Plugin does not support pdf to text. PDFs will be converted to HTML instead.\n*** Use PDFv2Plugin if you want pdf to actual text conversion. 
     1194 
     1195PDFv1Plugin.zoom:The factor by which to zoom the PDF for output. Only useful if -complex is set. 
     1196 
     1197PDFv2Plugin.zoom:The factor by which to zoom the PDF for (paged_)pretty_html output. Can be fractional. 
     1198 
     1199PDFv2Plugin.win_pdftotext_info:PDFv2Plugin uses Xpdf Tools to support pdf to text conversion, including on Windows. 
     1200 
     1201PDFv2Plugin.conversion_needs_pdfbox:*** Conversion to %s not supported with Xpdf Tools, defaulting to %s. Turn on pdfbox_conversion if you wish to enable output to selected format. 
    11841202 
    11851203PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html