Changeset 32224

Show
Ignore:
Timestamp:
27.06.2018 18:30:21 (3 weeks ago)
Author:
ak19
Message:

Adding PDF-to-text support for Windows using Xpdf's pdftotext tool. Previously, PDFPlugin would override the output mode to HTML on Windows whenever the text output mode was selected.

Location:
main/trunk/greenstone2
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/bin/script/gsConvert.pl

    r32223 r32224  
    6767my $pdf_allow_images_only; 
    6868my $windows_scripting; 
     69my $enc; 
    6970 
    7071sub print_usage 
     
    345346    # Attempt conversion to TEXT 
    346347    if (!$output_type || ($output_type =~ m/text/i)) { 
    347     $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 
     348        if ($ENV{'GSDLOS'} =~ m/^windows$/i) { # we now have pdf to text support for windows by using xpdf tools 
     349            $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 
     350        } else { 
     351            $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 
     352        } 
    348353    if ($success) { 
    349354        return "text"; 
     
    846851 
    847852    # build up the path to the doc-to-html conversion tool we're going to use 
    848     my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools"); 
    849      
    850     if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit 
    851     $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32"); 
    852     } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var 
    853      
    854     # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since 
    855     # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit 
    856     # specific subdirectories exist in a greenstone installation. 
    857     # None of those locations need exist when xpdf-tools is installed with GS. 
    858     # So don't depend on GSDLARCH as forcing that to be exported has side-effects 
    859     if($ENV{'BITNESS'}) { 
    860         $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'}); 
    861     } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64? 
    862         $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32"); 
    863     } 
    864     } 
     853    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml"); 
    865854     
    866855    # We'll create the file by name $output_filestem during post-conversion processing. 
     
    874863    = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$"); 
    875864    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages"); 
    876  
    877     $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml"); 
     865     
    878866    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100% 
    879867    $cmd .= "\"$xpdf_pdftohtml\""; 
     
    936924} 
    937925 
    938  
     926# Returns the path to xpdf-tools's containing bin dir appropriate for this machine's OS and bitness 
     927sub _get_xpdftools_bindir { 
     928 
     929    # build up the path to the containing bin dir of the xpdf conversion tool we're going to use 
     930    my $xpdf_tools_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools"); 
     931     
     932    if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit 
     933        $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin32"); 
     934    } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var 
     935         
     936        # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since $ENV{'GSDLARCH'} 
     937        # isn't always set and has side-effects when it is set: 
     938        # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit 
     939        # specific subdirectories exist in a greenstone installation. 
     940        # None of those locations need exist when xpdf-tools is installed with GS. 
     941        # So don't depend on GSDLARCH as forcing that to be exported has side-effects 
     942        if($ENV{'BITNESS'}) { 
     943            $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin".$ENV{'BITNESS'}); 
     944        } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64? 
     945            $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin32"); 
     946        } 
     947    } 
     948     
     949    return $xpdf_tools_bin; 
     950} 
    939951 
    940952# Convert a pdf file to various types of image with the convert command 
     
    10221034} 
    10231035 
     1036# Convert a PDF file to text with xpdftools' pdftotext command 
     1037# Works for Windows too, whereas the old pdftotxt didn't 
     1038sub xpdf_to_text { 
     1039    my ($dirname, $input_filename, $output_filestem) = @_; 
     1040 
     1041    my $cmd = ""; 
     1042 
     1043    # build up the path to the doc-to-txt conversion tool we're going to use 
     1044    my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext"); 
     1045     
     1046    # For xpdf's pdftotxt options, see https://www.xpdfreader.com/pdftotext-man.html 
     1047    $cmd .= "\"$xpdf_pdftotxt\""; 
     1048    if($enc) { 
     1049        $cmd .= " -enc $enc"; # decode the bytes in the file using the designated encoding scheme 
     1050    } else { 
     1051        # as per https://www.xpdfreader.com/pdftotext-man.html 
     1052        # xpdf's pdftotxt defaults to using Latin-1 encoding, should we default to UTF-8? 
     1053        $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html 
     1054    } 
     1055    $cmd .= " -nopgbrk"; 
     1056    # Avoid the silly solitary carriage returns (CR in Notepad) at the end 
     1057    # of lines that ends up as \n appended to the doc title 
     1058    # by setting the end of line marker to unix style solitary newline (LF or \n), 
     1059    # which doesn't end up in the doc title 
     1060    $cmd .= " -eol unix"; 
     1061    $cmd .= " \"$input_filename\" \"$output_filestem.text\"";     
     1062 
     1063    print STDERR "@@@@ Running command: $cmd\n"; 
     1064     
     1065    return _run_pdf_to_text_cmd($cmd, $output_filestem); 
     1066} 
     1067 
    10241068# Convert a PDF file to text with the pdftotext command 
    10251069 
     
    10281072 
    10291073    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\""; 
     1074     
     1075    return _run_pdf_to_text_cmd($cmd, $output_filestem); 
     1076} 
     1077 
     1078sub _run_pdf_to_text_cmd { 
     1079    my ($cmd, $output_filestem) = @_; 
    10301080 
    10311081    if ($ENV{'GSDLOS'} !~ m/^windows$/i) { 
  • main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm

    r32223 r32224  
    151151    # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 
    152152    if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 
    153     print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 
    154     $self->{'convert_to'} = "html"; 
     153        print STDERR "On Windows, Greenstone now uses Xpdf tools to support pdf to text conversion.\n"; 
     154    #print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 
     155    #$self->{'convert_to'} = "html"; 
    155156    } 
    156157    elsif ($self->{'convert_to'} eq "auto") {