Changeset 32205 for main/trunk/greenstone2/bin/script/gsConvert.pl
- Timestamp:
- 2018-06-21T21:41:12+12:00 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r30724 r32205 323 323 324 324 # Attempt conversion to HTML 325 if (!$output_type || ($output_type =~ m/html/i)) { 325 # Uses the old pdftohtml that doesn't work for newer PDF versions 326 #if ($output_type =~ m/^html/i) { 327 if (!$output_type || ($output_type =~ m/^html/i)) { 326 328 $success = &pdf_to_html($dirname, $input_filename, $output_filestem); 327 329 if ($success) { 328 330 return "html"; 331 } 332 } 333 334 # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This 335 # will be the new default for PDFs when output_type for PDF docs is not specified 336 # (once our use of xpdftools' pdftohtml has been implemented on win and mac). 337 if ($output_type =~ m/paged_html/i) { 338 #if (!$output_type || ($output_type =~ m/paged_html/i)) { 339 $success = &xpdf_to_html($dirname, $input_filename, $output_filestem); 340 if ($success) { 341 return "paged_html"; 329 342 } 330 343 } … … 756 769 757 770 758 # Convert a pdf file to html with the pdftohtml command759 771 # Convert a pdf file to html with the old pdftohtml command 772 # which only works for older PDF versions 760 773 sub pdf_to_html { 761 774 my ($dirname, $input_filename, $output_filestem) = @_; … … 819 832 return 1; 820 833 } 834 835 836 # Convert a pdf file to html with the newer Xpdftools' pdftohtml 837 # This generates "paged HTML" where extracted, selectable text is positioned 838 # over screenshots of each page. 
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# Parameters:
#   $dirname         - accepted for symmetry with the other converters; not
#                      used by this routine
#   $input_filename  - path of the PDF file handed to pdftohtml
#   $output_filestem - extension-less output path; its parent directory is
#                      where the "pages" subdirectory is created, and it is
#                      also the stem of the temporary .out/.err capture files
# Returns 1 if pdftohtml exited with status 0 and produced a non-empty
# pages/index.html, and 0 otherwise.
# Dies if a non-empty error log exists but cannot be opened for reading.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # TODO
    } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
        # TODO
    } else { # unix, use the appropriate bin folder for the bitness of the system

        # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
        # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
        # specific subdirectories exist in a greenstone installation.
        # None of those locations need exist when xpdf-tools is installed with GS.
        # So don't depend on GSDLARCH as forcing that to be exported has side-effects
        if ($ENV{'BITNESS'}) {
            $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
        } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
            $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
        }
    }

    # We'll create the file by name $output_filestem during post-conversion processing.
    # Note that Xpdf tools will only create its conversion products in a dir that does
    # not yet exist. So we'll create this location as a subdir of the output_filestem's
    # parent directory. The parent dir is the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
    # Only the directory component of fileparse's result is needed here.
    my (undef, $tmp_dirname) = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    # $cmd .= " -c" if ($pdf_complex);
    # $cmd .= " -i" if ($pdf_ignore_images);
    # $cmd .= " -a" if ($pdf_allow_images_only);
    # $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";
    #$cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Capture stdout and stderr separately, except in the windows non-NT/2000
    # case, where only stdout is redirected (into the .err file).
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    # Reset errno first so that a stale value is not reported below.
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";
    }

    # make sure the converter made something
    my $index_file = &FileUtils::filenameConcatenate($tmp_dirname, "index.html");
    if ($retval != 0 || !-s $index_file) {
        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            # 3-arg open with a lexical handle: the path is never parsed for
            # mode characters and the handle cannot clash with other ERRLOG users.
            open(my $errlog_fh, '<', $err_file) || die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog_fh>) {
                print STDERR $line;
            }
            close($errlog_fh);
        }
        #print STDERR "***********output filestem $output_filestem.html\n";

        # NOTE(review): $tmp_dirname is a directory; confirm that
        # FileUtils::removeFiles is able to remove directories.
        &FileUtils::removeFiles($tmp_dirname) if (-d $tmp_dirname);

        if (-e $err_file) {
            # Append the converter's error output to the fail log, if one is configured.
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                if (open(my $errlog_fh, '<', $err_file)) {
                    while (my $line = <$errlog_fh>) {
                        print $faillog_fh $line;
                    }
                    close($errlog_fh);
                } else {
                    print STDERR "Could not read $err_file: $!\n";
                }
                close($faillog_fh);
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # Success: the capture files are no longer needed.
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}



# Convert a pdf file to various types of image with the convert command
Note:
See TracChangeset
for help on using the changeset viewer.