Changeset 32224


Ignore:
Timestamp:
2018-06-27T18:30:21+12:00 (6 years ago)
Author:
ak19
Message:

Adding PDF to text support for Windows using Xpdf's pdftotext tool. Previously PDFPlugin would override the output mode to HTML on Windows if txt output mode was selected.

Location:
main/trunk/greenstone2
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32223 r32224  
    6767my $pdf_allow_images_only;
    6868my $windows_scripting;
     69my $enc;
    6970
    7071sub print_usage
     
    345346    # Attempt conversion to TEXT
    346347    if (!$output_type || ($output_type =~ m/text/i)) {
    347     $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
     348        if ($ENV{'GSDLOS'} =~ m/^windows$/i) { # we now have pdf to text support for windows by using xpdf tools
     349            $success = &xpdf_to_text($dirname, $input_filename, $output_filestem);
     350        } else {
     351            $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
     352        }
    348353    if ($success) {
    349354        return "text";
     
    846851
    847852    # build up the path to the doc-to-html conversion tool we're going to use
    848     my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
    849    
    850     if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit
    851     $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
    852     } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var
    853    
    854     # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
    855     # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
    856     # specific subdirectories exist in a greenstone installation.
    857     # None of those locations need exist when xpdf-tools is installed with GS.
    858     # So don't depend on GSDLARCH as forcing that to be exported has side-effects
    859     if($ENV{'BITNESS'}) {
    860         $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
    861     } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
    862         $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
    863     }
    864     }
     853    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");
    865854   
    866855    # We'll create the file by name $output_filestem during post-conversion processing.
     
    874863    = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    875864    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
    876 
    877     $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");
     865   
    878866    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    879867    $cmd .= "\"$xpdf_pdftohtml\"";
     
    936924}
    937925
    938 
     926# Returns the path to xpdf-tools's containing bin dir appropriate for this machine's OS and bitness
     927sub _get_xpdftools_bindir {
     928
     929    # build up the path to the containing bin dir of the xpdf conversion tool we're going to use
     930    my $xpdf_tools_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
     931   
     932    if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf tools (pdftohtml, pdftotext) as they work the same as the 64 bit
     933        $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin32");
     934    } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var
     935       
     936        # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since $ENV{'GSDLARCH'}
     937        # isn't always set and has side-effects when it is set:
     938        # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
     939        # specific subdirectories exist in a greenstone installation.
     940        # None of those locations need exist when xpdf-tools is installed with GS.
     941        # So don't depend on GSDLARCH as forcing that to be exported has side-effects
     942        if($ENV{'BITNESS'}) {
     943            $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin".$ENV{'BITNESS'});
     944        } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
     945            $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin32");
     946        }
     947    }
     948   
     949    return $xpdf_tools_bin;
     950}
    939951
    940952# Convert a pdf file to various types of image with the convert command
     
    10221034}
    10231035
     1036# Convert a PDF file to text with xpdftools' pdftotext command
     1037# Works on Windows too, whereas the old pdf_to_text didn't
     1038sub xpdf_to_text {
     1039    my ($dirname, $input_filename, $output_filestem) = @_;
     1040
     1041    my $cmd = "";
     1042
     1043    # build up the path to the doc-to-txt conversion tool we're going to use
     1044    my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");
     1045   
     1046    # For xpdf's pdftotext options, see https://www.xpdfreader.com/pdftotext-man.html
     1047    $cmd .= "\"$xpdf_pdftotxt\"";
     1048    if($enc) {
     1049        $cmd .= " -enc $enc"; # decode the bytes in the file using the designated encoding scheme
     1050    } else {
     1051        # as per https://www.xpdfreader.com/pdftotext-man.html
     1052        # xpdf's pdftotext defaults to using Latin-1 encoding, should we default to UTF-8?
     1053        $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html
     1054    }
     1055    $cmd .= " -nopgbrk";
     1056    # Avoid the silly solitary carriage returns (shown as CR in Notepad) at the ends
     1057    # of lines, which end up as \n appended to the doc title,
     1058    # by setting the end-of-line marker to the unix-style solitary newline (LF or \n),
     1059    # which doesn't end up in the doc title
     1060    $cmd .= " -eol unix";
     1061    $cmd .= " \"$input_filename\" \"$output_filestem.text\"";   
     1062
     1063    print STDERR "@@@@ Running command: $cmd\n";
     1064   
     1065    return _run_pdf_to_text_cmd($cmd, $output_filestem);
     1066}
     1067
    10241068# Convert a PDF file to text with the pdftotext command
    10251069
     
    10281072
    10291073    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
     1074   
     1075    return _run_pdf_to_text_cmd($cmd, $output_filestem);
     1076}
     1077
     1078sub _run_pdf_to_text_cmd {
     1079    my ($cmd, $output_filestem) = @_;
    10301080
    10311081    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32223 r32224  
    151151    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac
    152152    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) {
    153     print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
    154     $self->{'convert_to'} = "html";
     153        print STDERR "On Windows, Greenstone now uses Xpdf tools to support pdf to text conversion.\n";
     154    #print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n";
     155    #$self->{'convert_to'} = "html";
    155156    }
    156157    elsif ($self->{'convert_to'} eq "auto") {
Note: See TracChangeset for help on using the changeset viewer.