Changeset 32224 for main/trunk
- Timestamp:
- 2018-06-27T18:30:21+12:00 (6 years ago)
- Location:
- main/trunk/greenstone2
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r32223 r32224 67 67 my $pdf_allow_images_only; 68 68 my $windows_scripting; 69 my $enc; 69 70 70 71 sub print_usage … … 345 346 # Attempt conversion to TEXT 346 347 if (!$output_type || ($output_type =~ m/text/i)) { 347 $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 348 if ($ENV{'GSDLOS'} =~ m/^windows$/i) { # we now have pdf to text support for windows by using xpdf tools 349 $success = &xpdf_to_text($dirname, $input_filename, $output_filestem); 350 } else { 351 $success = &pdf_to_text($dirname, $input_filename, $output_filestem); 352 } 348 353 if ($success) { 349 354 return "text"; … … 846 851 847 852 # build up the path to the doc-to-html conversion tool we're going to use 848 my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools"); 849 850 if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit 851 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32"); 852 } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var 853 854 # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since 855 # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit 856 # specific subdirectories exist in a greenstone installation. 857 # None of those locations need exist when xpdf-tools is installed with GS. 858 # So don't depend on GSDLARCH as forcing that to be exported has side-effects 859 if($ENV{'BITNESS'}) { 860 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'}); 861 } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64? 862 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32"); 863 } 864 } 853 my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml"); 865 854 866 855 # We'll create the file by name $output_filestem during post-conversion processing. 
… … 874 863 = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$"); 875 864 $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages"); 876 877 $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml"); 865 878 866 # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100% 879 867 $cmd .= "\"$xpdf_pdftohtml\""; … … 936 924 } 937 925 938 926 # Returns the path to xpdf-tools's containing bin dir appropriate for this machine's OS and bitness 927 sub _get_xpdftools_bindir { 928 929 # build up the path to the containing bin dir of the xpdf conversion tool we're going to use 930 my $xpdf_tools_bin = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools"); 931 932 if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit 933 $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin32"); 934 } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var 935 936 # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since $ENV{'GSDLARCH'} 937 # isn't always set and has side-effects when it is set: 938 # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit 939 # specific subdirectories exist in a greenstone installation. 940 # None of those locations need exist when xpdf-tools is installed with GS. 941 # So don't depend on GSDLARCH as forcing that to be exported has side-effects 942 if($ENV{'BITNESS'}) { 943 $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin".$ENV{'BITNESS'}); 944 } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64? 
945 $xpdf_tools_bin = &FileUtils::filenameConcatenate($xpdf_tools_bin, "bin32"); 946 } 947 } 948 949 return $xpdf_tools_bin; 950 } 939 951 940 952 # Convert a pdf file to various types of image with the convert command … … 1022 1034 } 1023 1035 1036 # Convert a PDF file to text with xpdftools' pdftotext command 1037 # Works for Windows too, whereas the old pdftotxt didn't 1038 sub xpdf_to_text { 1039 my ($dirname, $input_filename, $output_filestem) = @_; 1040 1041 my $cmd = ""; 1042 1043 # build up the path to the doc-to-txt conversion tool we're going to use 1044 my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext"); 1045 1046 # For xpdf's pdftotxt options, see https://www.xpdfreader.com/pdftotext-man.html 1047 $cmd .= "\"$xpdf_pdftotxt\""; 1048 if($enc) { 1049 $cmd .= " -enc $enc"; # decode the bytes in the file using the designated encoding scheme 1050 } else { 1051 # as per https://www.xpdfreader.com/pdftotext-man.html 1052 # xpdf's pdftotxt defaults to using Latin-1 encoding, should we default to UTF-8? 
1053 $cmd .= " -enc UTF-8"; # see https://www.xpdfreader.com/xpdfrc-man.html 1054 } 1055 $cmd .= " -nopgbrk"; 1056 # Avoid the silly solitary carriage returns (CR in Notepad) at the end 1057 # of lines that ends up as \n appended to the doc title 1058 # by setting the end of line marker to unix style solitary newline (LF or \n), 1059 # which doesn't end up in the doc title 1060 $cmd .= " -eol unix"; 1061 $cmd .= " \"$input_filename\" \"$output_filestem.text\""; 1062 1063 print STDERR "@@@@ Running command: $cmd\n"; 1064 1065 return _run_pdf_to_text_cmd($cmd, $output_filestem); 1066 } 1067 1024 1068 # Convert a PDF file to text with the pdftotext command 1025 1069 … … 1028 1072 1029 1073 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\""; 1074 1075 return _run_pdf_to_text_cmd($cmd, $output_filestem); 1076 } 1077 1078 sub _run_pdf_to_text_cmd { 1079 my ($cmd, $output_filestem) = @_; 1030 1080 1031 1081 if ($ENV{'GSDLOS'} !~ m/^windows$/i) { -
main/trunk/greenstone2/perllib/plugins/PDFPlugin.pm
r32223 r32224 151 151 # TODO: Start supporting PDF to txt on Windows if we're going to be using XPDF Tools (incl pdftotext) on Windows/Linux/Mac 152 152 if ($self->{'convert_to'} eq "text" && $ENV{'GSDLOS'} =~ /^windows$/i) { 153 print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 154 $self->{'convert_to'} = "html"; 153 print STDERR "On Windows, Greenstone now uses Xpdf tools to support pdf to text conversion.\n"; 154 #print STDERR "Windows does not support pdf to text. PDFs will be converted to HTML instead\n"; 155 #$self->{'convert_to'} = "html"; 155 156 } 156 157 elsif ($self->{'convert_to'} eq "auto") {
Note: See TracChangeset for help on using the changeset viewer.