Changeset 32273

Show
Ignore:
Timestamp:
13.07.2018 20:40:24 (9 months ago)
Author:
ak19
Message:

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion will be moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses the old pdftohtml tool and still has the pdfbox_conversion option. PDFv2Plugin is yet to be introduced. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote that the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools. PDFBoxConverter sets it to pdfbox for symmetry's sake, even though, being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One is for migrating users, to indicate that PDFPlugin is being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.

Files:
1 added
4 modified

Legend:

Unmodified
Added
Removed
  • gs2-extensions/pdf-box/trunk/java/perllib/plugins/PDFBoxConverter.pm

    r32198 r32273  
    172172    return 0 if ( !-f $source_file_full_path); 
    173173 
     174    # Although PDFBoxConverter inherits from AutoLoadConverters and therefore 
     175    # doesn't go through gsConvert.pl, still set the -pdf_tool flag in convert_options 
     176    # in case in future PDFBoxConverter no longer inherits from AutoLoadConverters 
     177    # and ends up going through gsConvert.pl 
     178    $self->{'convert_options'} .= " -pdf_tool pdfbox"; 
     179     
    174180    my $img_output_mode = 0; 
    175181 
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32263 r32273  
    6161 
    6262my $use_strings; 
     63my $pdf_tool; 
    6364my $pdf_complex; 
    6465my $pdf_nohidden; 
     
    7778    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 
    7879    print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 
    79     print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; 
     80    print STDERR "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; 
    8081    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 
    8182    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 
    8283    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n"; 
     84    print STDERR "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n"; 
    8385    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 
    8486    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; 
     
    120122             "type/$type_re/", \$input_type, 
    121123             '/errlog/.*/', \$faillogfile, 
    122              'output/(auto|html|text|pagedimg).*/', \$output_type, 
     124             'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html 
    123125             'timeout/\d+/0',\$timeout, 
    124126             'verbose/\d+/0', \$verbose, 
    125127             'windows_scripting',\$windows_scripting, 
    126128             'use_strings', \$use_strings, 
    127              'pdf_complex', \$pdf_complex, 
     129             'pdf_tool/(pdftohtml|pdfbox|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools 
     130             'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool) 
    128131             'pdf_ignore_images', \$pdf_ignore_images, 
    129132             'pdf_allow_images_only', \$pdf_allow_images_only, 
     
    315318    my $success = 0; 
    316319    $output_type =~ s/.*\-(.*)/$1/i; 
     320 
     321    # First determine which pdf conversion tool we're using among pdftohtml/pdfbox/xpdftools 
     322    # and then decide which conversion command to run based on the output type  
     323    # (pdfbox does not currently go through gsConvert.pl 
     324    # as PDFBoxConverter inherits from AutoLoadConverters) 
     325     
     326  if ($pdf_tool eq "pdftohtml" ) { # old pdftohtml tool 
    317327    # Attempt coversion to Image 
    318328    if ($output_type =~ m/jp?g|gif|png/i) { 
     
    333343    } 
    334344 
    335     # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This 
    336     # will be the new default for PDFs when output_type for PDF docs is not specified 
    337     # (once our use of xpdftools' pdftohtml has been implemented on win and mac). 
    338     #if ($output_type =~ m/paged_html/i) { 
    339     if (!$output_type || ($output_type =~ m/paged_html/i)) { 
    340     $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 
    341     if ($success) { 
    342         return "paged_html"; 
    343     } 
    344     } 
    345  
    346     # Attempt conversion to TEXT 
     345    # Attempt conversion to TEXT (not for Windows, but PDFPlugin/PDFv1Plugin takes care of that)
    347346    if (!$output_type || ($output_type =~ m/text/i)) { 
    348         $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 
    349         #if ($ENV{'GSDLOS'} =~ m/^windows$/i) { # we now have pdf to text support for windows by using xpdf tools 
    350         #   $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 
    351         #} else { 
    352         #   $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 
    353         #} 
     347    $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 
     348 
    354349    if ($success) { 
    355350        return "text"; 
    356351    } 
    357352    } 
    358  
     353  } 
     354     
     355  elsif ($pdf_tool eq "xpdftools" ) { 
     356    # default to html output 
     357    if (!$output_type) { 
     358        $output_type = "html"; 
     359    } 
     360     
     361    # Attempt coversion to Image 
     362    #if ($output_type =~ m/jp?g|gif|png/i) { 
     363    #    $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type); 
     364    #    if ($success){ 
     365    #   return "item"; 
     366    #    } 
     367    #} 
     368     
     369    # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. 
     370    if ($output_type =~ m/^(paged_html|html)$/i) { 
     371        $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 
     372        if ($success) { 
     373        return $output_type; 
     374        } 
     375    } 
     376     
     377    # Attempt conversion to TEXT 
     378    if (!$output_type || ($output_type =~ m/text/i)) {       
     379        $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 
     380         
     381        if ($success) { 
     382        return "text"; 
     383        } 
     384    } 
     385  } 
     386     
    359387    return "fail"; 
    360388 
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32224 r32273  
    140140    $self->{'file_type'} = "PDF"; 
    141141 
     142    # PDFPlugin is deprecated and migrating users should hereafter choose between 
     143    # PDFv1Plugin, if they want to use the old pdftohtml tool's capabilities, 
     144    # and PDFv2Plugin, if they want to use pdfbox or the new xpdftools capabilities. 
     145    &gsprintf::gsprintf(STDERR, "{PDFPlugin.deprecated_plugin}"); 
     146     
    142147    # these are passed through to gsConvert.pl by ConvertBinaryFile.pm 
    143148    my $zoom = $self->{"zoom"}; 
    144     $self->{'convert_options'} = "-pdf_zoom $zoom"; 
     149    # By default, PDFPlugin assumes gsConvert.pl will run the old pdftohtml conversion tool, 
     150    # But if pdfbox conversion is turned on, the tool used is pdfbox (which is presently an 
     151    # AutoLoadConverter and therefore bypasses gsConvert.pl) 
     152    $self->{'convert_options'} = "-pdf_tool pdftohtml";  
     153    $self->{'convert_options'} .= " -pdf_zoom $zoom"; 
    145154    $self->{'convert_options'} .= " -pdf_complex" if $self->{"complex"}; 
    146155    $self->{'convert_options'} .= " -pdf_nohidden" if $self->{"nohidden"}; 
     
    151160    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 
    152161    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 
    153         print STDERR "On Windows, Greenstone now uses Xpdf tools to support pdf to text conversion.\n"; 
    154     #print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 
    155     #$self->{'convert_to'} = "html"; 
     162    #print STDERR "On Windows, Greenstone now uses Xpdf tools to support pdf to text conversion.\n"; 
     163    print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 
     164    $self->{'convert_to'} = "html"; 
    156165    } 
    157166    elsif ($self->{'convert_to'} eq "auto") { 
     
    407416    # Copying file open/close code from CommonUtil::utf8_write_file() 
    408417    if (!open (OUTFILE, ">:utf8", $output_filename)) { 
    409     gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {ConvertToPlug.could_not_open_for_writing} ($!)\n", $output_filename); 
     418    gsprintf(STDERR, "PDFPlugin::xpdftohtml_convert_post_process {CommonUtil.could_not_open_for_writing} ($!)\n", $output_filename); 
    410419    die "\n"; 
    411420    } 
  • main/trunk/greenstone2/perllib/strings.properties

    r32222 r32273  
    809809CommonUtil.block_exp:Files matching this regular expression will be blocked from being passed to any later plugins in the list. 
    810810 
     811CommonUtil.could_not_open_for_writing:could not open %s for writing 
     812 
    811813CommonUtil.desc:Base Utility plugin class that handles filename encoding and file blocking. 
    812814 
     
    11651167PDFPlugin.convert_to.paged_html:A series of HTML pages, one for each page. Each HTML page contains selectable text positionally overlaid on top of a screenshot of the PDF page background comprising any images, tables and drawings. 
    11661168 
    1167 PDFPlugin.desc:Plugin that processes PDF documents. 
     1169PDFPlugin.deprecated_plugin:*************IMPORTANT******************\nPDFPlugin is being deprecated.\nConsider upgrading to the recommended PDFv2Plugin, which supports newer versions of PDFs.\nAlternatively, if you wish to retain the old style of conversion and are NOT relying on PDFBox,\nchange to PDFv1Plugin.\nIf you are using PDFBox then upgrade to PDFv2Plugin.\n*****************************************\n 
     1170 
     1171PDFPlugin.desc:Plugin that processes PDF documents using the older pdftohtml tool. Does not support newer PDF versions. 
    11681172 
    11691173PDFPlugin.nohidden:Prevent pdftohtml from attempting to extract hidden text. This is only useful if the -complex option is also set.