Show
Ignore:
Timestamp:
19.07.2018 19:54:32 (16 months ago)
Author:
ak19
Message:

1. Making paged_pretty_html the default rather than pretty_html, since it's likely more users will want their converted PDF sectionalised. 2. Hopefully improved the display strings to make sense for users rather than for me.

Location:
main/trunk/greenstone2
Files:
4 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32287 r32290  
    124124             "type/$type_re/", \$input_type, 
    125125             '/errlog/.*/', \$faillogfile, 
    126              'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html 
     126             'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and (paged_)pretty_html besides html, as well as pagedimgtxt_<imgext> besides pagedimg_<imgext> 
    127127             'timeout/\d+/0',\$timeout, 
    128128             'verbose/\d+/0', \$verbose, 
     
    360360    elsif ($pdf_tool eq "xpdftools" ) { 
    361361     
    362     # default to pretty html output 
     362    # default to paged_pretty_html output 
    363363    if (!$output_type) { 
    364         $output_type = "pretty_html"; 
     364        $output_type = "paged_pretty_html"; 
    365365    } 
    366366     
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32289 r32290  
    165165    if ($self->{'use_realistic_book'}) { 
    166166    if ($self->{'convert_to'} ne "html") { 
    167         print STDERR "PDFs will be converted to HTML for realistic book functionality\n"; 
     167        &gsprintf::gsprintf(STDERR, "PDFv2Plugin: {PDFPlugin.html_for_realistic_book}\n"); 
    168168        $self->{'convert_to'} = "html"; 
    169169    } 
  • main/trunk/greenstone2/perllib/plugins/PDFv2Plugin.pm

    r32287 r32290  
    7676       'reqd' => "yes", 
    7777       'list' => $convert_to_list,  
    78        'deft' => "pretty_html" },     
     78       'deft' => "paged_pretty_html" },   
    7979     { 'name' => "process_exp", 
    8080       'desc' => "{BaseImporter.process_exp}", 
     
    8686       'type' => "regexp", 
    8787       'deft' => &get_default_block_exp() }, 
    88      { 'name' => "metadata_fields", 
    89        'desc' => "{HTMLPlugin.metadata_fields}", 
    90        'type' => "string", 
    91        'deft' => "Title,Author,Subject,Keywords" }, 
    92      { 'name' => "metadata_field_separator", 
    93     'desc' => "{HTMLPlugin.metadata_field_separator}", 
    94     'type' => "string", 
    95     'deft' => "" }, 
     88#     { 'name' => "metadata_fields", 
     89#       'desc' => "{HTMLPlugin.metadata_fields}", 
     90#       'type' => "string", 
     91#       'deft' => "Title,Author,Subject,Keywords" }, 
     92#     { 'name' => "metadata_field_separator", 
     93#   'desc' => "{HTMLPlugin.metadata_field_separator}", 
     94#   'type' => "string", 
     95#   'deft' => "" }, 
    9696     { 'name' => "dpi", 
    9797       'desc' => "{PDFv2Plugin.dpi}", 
     
    106106      { 'name' => "use_realistic_book", 
    107107        'desc' => "{PDFPlugin.use_realistic_book}", 
    108     'type' => "flag"} 
     108    'type' => "flag" } 
    109109     ]; 
    110110 
     
    126126    my $pdfbox_converter_self = new PDFBoxConverter($pluginlist, $inputargs, $hashArgOptLists); 
    127127    my $cbf_self = new ConvertBinaryFile($pluginlist, $inputargs, $hashArgOptLists); 
    128     my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self); 
     128    my $self = BaseImporter::merge_inheritance($pdfbox_converter_self, $cbf_self); # this param order seems necessary to preserve the default/user-selected value for the convert_to option 
    129129     
    130130    if ($self->{'info_only'}) { 
     
    151151 
    152152    if ($self->{'convert_to'} eq "auto") { 
    153     # choose pretty_html is the best default option when using xpdftools 
    154     $self->{'convert_to'} = "pretty_html"; 
     153    # defaulting to paged_pretty_html, as it's the best default option when using xpdftools 
     154    $self->{'convert_to'} = "paged_pretty_html"; 
     155    &gsprintf::gsprintf(STDERR, "PDFv2Plugin: {PDFv2Plugin.auto_output_default}\n", $self->{'convert_to'}); 
    155156    } 
    156157    if ($self->{'use_realistic_book'}) { 
    157158    if ($self->{'convert_to'} ne "html") { 
    158         print STDERR "PDFs will be converted to HTML for realistic book functionality\n"; 
     159        &gsprintf::gsprintf(STDERR, "PDFv2Plugin: {PDFPlugin.html_for_realistic_book}\n"); 
    159160        $self->{'convert_to'} = "html"; 
    160161    } 
     
    424425    # Copying file open/close code from CommonUtil::utf8_write_file() 
    425426    if (!open (OUTFILE, ">:utf8", $output_filename)) { 
    426     gsprintf(STDERR, "PDFv2Plugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename); 
     427    &gsprintf::gsprintf(STDERR, "PDFv2Plugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename); 
    427428    die "\n"; 
    428429    } 
  • main/trunk/greenstone2/perllib/strings.properties

    r32287 r32290  
    845845ConvertBinaryFile.convert_to.text:Plain text format. 
    846846 
    847 ConvertBinaryFile.convert_to.paged_text:Text separately extracted for each individual page. 
     847ConvertBinaryFile.convert_to.paged_text:Sectionalised plain text, where every page's text is its own section. 
    848848 
    849849ConvertBinaryFile.convert_to.pagedimg:A series of images. 
     
    11711171PDFPlugin.complex:Create more complex output. With this option set the output html will look much more like the original PDF file. For this to function properly you need Ghostscript installed (for *nix gs should be on your path while for windows you must have gswin32c.exe on your path). 
    11721172 
    1173 PDFPlugin.convert_to.html:HTML. Text only, no images. 
    1174  
    1175 PDFPlugin.convert_to.pretty_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 
    1176  
    1177 PDFPlugin.convert_to.paged_pretty_html:Sectionalised variant of pretty_html to allow jumping to individual pages. 
     1173PDFPlugin.convert_to.html:Very basic HTML comprising just the extracted text, no images. 
     1174 
     1175PDFPlugin.convert_to.pretty_html:Each PDF page as HTML containing selectable text positionally overlaid on top of a textless screenshot of the PDF page. 
     1176 
     1177PDFPlugin.convert_to.paged_pretty_html:Sectionalised pretty_html, where each page's html is its own section. 
    11781178 
    11791179PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n 
     
    11811181PDFPlugin.desc:Plugin that processes PDF documents using the older pdftohtml tool. Does not support newer PDF versions. 
    11821182 
     1183PDFPlugin.html_for_realistic_book:PDFs will be converted to HTML for realistic book functionality 
     1184 
    11831185PDFPlugin.nohidden:Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set. 
    11841186 
    11851187PDFPlugin.noimages:Don't attempt to extract images from PDF. 
    11861188 
     1189PDFv2Plugin.auto_output_default:Defaulting to output format %s 
     1190 
    11871191PDFPlugin.use_realistic_book:Converts the PDF to a well-formed XHTML document to enable users to view it in the realistic book format. 
    11881192 
     
    11951199PDFv1Plugin.zoom:The factor by which to zoom the PDF for output. Only useful if -complex is set. 
    11961200 
    1197 PDFv2Plugin.dpi:The resolution in DPI of background images generated for pagedimg(txt) and (paged_)pretty_html output settings. 
     1201PDFv2Plugin.dpi:The resolution in DPI of background images generated when convert_to is set to any of the pagedimg(txt) and (paged_)pretty_html formats. 
    11981202 
    11991203PostScriptPlugin.desc:This is a \"poor man's\" ps to text converter. If you are serious, consider using the PRESCRIPT package, which is available for download at http://www.nzdl.org/html/software.html