Changeset 22429 for main/trunk/greenstone2/bin
- Timestamp:
- 2010-07-19T13:28:14+12:00 (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/bin/script/gsConvert.pl
r20933 r22429 49 49 } 50 50 51 use strict; 52 51 53 use parsargv; 52 54 use util; … … 65 67 my $pdf_allow_images_only; 66 68 my $windows_scripting; 69 my $openoffice_scripting; 67 70 68 71 sub print_usage … … 72 75 print STDERR " or text using third-party programs.\n\n"; 73 76 print STDERR " usage: $0 [options] filename\n"; 74 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 77 if ($openoffice_scripting) { 78 print STDERR " options:\n\t-type\tdoc|dot|docx|odf|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 79 } 80 else { 81 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 82 } 75 83 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 76 84 print STDERR "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n"; 77 85 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 78 86 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 79 print STDERR "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n"; 87 print STDERR "\t-windows_scripting\tuse windows script (if available) when converting Microsoft Word and PPT via VB script\n"; 88 print STDERR "\t-openoffice_scripting\tuse openoffice script (if available) when converting Microsoft Word and PPT via OpenOffice\n"; 80 89 print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 81 90 print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; … … 96 105 my ($input_type,$output_type,$verbose); 97 106 107 108 # scan for -openoffice_scripting as it effects the permissible 109 # values for -type 110 111 foreach my $a (@ARGV) { 112 if ($a =~ m/^-openoffice_scripting$/) { 113 $openoffice_scripting = 1; 114 last; 115 } 116 } 117 118 my $parse_type; 119 if ($openoffice_scripting) { 120 $parse_type = 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/'; 121 } 122 else { 123 $parse_type = 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/'; 124 } 125 98 126 # read command-line arguments 99 127 if (!parsargv::parse(\@ARGV, 100 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,128 $parse_type, \$input_type, 101 129 '/errlog/.*/', \$faillogfile, 102 130 'output/(auto|html|text|pagedimage).*/', \$output_type, 103 131 'timeout/\d+/0',\$timeout, 104 132 'verbose/\d+/0', \$verbose, 133 'windows_scripting',\$windows_scripting, 134 'openoffice_scripting',\$openoffice_scripting, 105 135 'use_strings', \$use_strings, 106 'windows_scripting',\$windows_scripting,107 136 'pdf_complex', \$pdf_complex, 108 137 'pdf_ignore_images', \$pdf_ignore_images, … … 144 173 print STDERR "Error: No filename extension or input type defined\n"; 145 174 exit(1); 175 } 176 elsif ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) { 177 print &convertDOC($input_filename, $output_filestem, $output_type); 178 print "\n"; 146 179 } 147 180 elsif ($input_type eq "doc" || $input_type eq "dot") { … … 158 191 } 159 192 elsif ($input_type eq "ps") { 160 print &convertPS($ input_filename, $output_filestem, $output_type);193 print &convertPS($dirname, $input_filename, $output_filestem, $output_type); 161 194 print "\n"; 162 195 } … … 195 228 196 229 sub convertDOC { 197 ($input_filename, $output_filestem, $output_type) = @_; 230 my ($input_filename, $output_filestem, $output_type) = @_; 231 232 if (($openoffice_scripting) && ($input_filename =~ m/\.docx?$/i)) { 233 # Jump right in and process with Open Office 234 if (openoffice_doc_to_html($input_filename, $output_filestem)) { 235 return "html"; 236 } 237 else { 238 return "fail"; 239 } 240 } 198 241 199 242 # Many .doc files are not in fact word documents! … … 212 255 213 256 sub convertWord678 { 214 ($input_filename, $output_filestem, $output_type) = @_;257 my ($input_filename, $output_filestem, $output_type) = @_; 215 258 216 259 my $success = 0; … … 219 262 $success = &native_doc_to_html($input_filename, $output_filestem); 220 263 } 264 elsif ($openoffice_scripting) { 265 $success = &openoffice_doc_to_html($input_filename, $output_filestem); 266 } 221 267 else { 222 268 $success = &doc_to_html($input_filename, $output_filestem); … … 233 279 234 280 sub convertRTF { 235 ($input_filename, $output_filestem, $output_type) = @_;281 my ($input_filename, $output_filestem, $output_type) = @_; 236 282 237 283 my $success = 0; … … 242 288 if ($windows_scripting) { 243 289 $success = &native_doc_to_html($input_filename, $output_filestem); 290 } 291 elsif ($openoffice_scripting) { 292 $success = &openoffice_doc_to_html($input_filename, $output_filestem); 244 293 } 245 294 else { … … 261 310 262 311 sub convertAnything { 263 ($input_filename, $output_filestem, $output_type) = @_;312 my ($input_filename, $output_filestem, $output_type) = @_; 264 313 265 314 my $success = 0; … … 324 373 325 374 sub convertPS { 326 ($input_filename, $output_filestem, $output_type) = @_;375 my ($dirname,$input_filename, $output_filestem, $output_type) = @_; 327 376 328 377 my $success = 0; … … 365 414 $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 366 415 367 $cmd = "";416 my $cmd = ""; 368 417 if ($timeout) {$cmd = "ulimit -t $timeout;";} 369 # if the converting directory has already existed418 # if the converting directory already exists 370 419 if (-d $output_filestem) { 371 print STDERR "**The conversion directory has existed\n";420 print STDERR "**The conversion directory already exists\n"; 372 421 return "item"; 373 422 } else { … … 385 434 #if (!$output_type || ($output_type =~ m/html/i)) { 386 435 # formulate the command 387 $cmd = "";436 my $cmd = ""; 388 437 $cmd .= "perl -S ppttohtml.pl "; 389 438 $cmd .= " \"$input_filename\" \"$output_filestem.html\""; … … 418 467 if (!$output_type || ($output_type =~ m/html/i)) { 419 468 # formulate the command 420 $cmd = "";469 my $cmd = ""; 421 470 $cmd .= "perl -S xlstohtml.pl "; 422 471 $cmd .= " \"$input_filename\" \"$output_filestem.html\""; … … 450 499 # files or Word 5 files. This function attempts to tell the difference. 451 500 sub find_docfile_type { 452 ($input_filename) = @_;501 my ($input_filename) = @_; 453 502 454 503 open(CHK, "<$input_filename"); … … 492 541 # Attempt to convert a word document to html with the wv program 493 542 sub doc_to_html { 494 ($input_filename, $output_filestem) = @_;543 my ($input_filename, $output_filestem) = @_; 495 544 496 545 my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare"); … … 612 661 if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents) 613 662 open(TMP, "$output_filestem.html"); 614 $line = <TMP>;663 my $line = <TMP>; 615 664 close(TMP); 616 665 if ($line && $line =~ m/DOCTYPE HTML/) { … … 750 799 # Attempt to convert a word document to html with the word2html scripting program 751 800 sub native_doc_to_html { 752 ($input_filename, $output_filestem) = @_;801 my ($input_filename, $output_filestem) = @_; 753 802 754 803 my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", … … 757 806 $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i); 758 807 if (-e "$output_filestem.html") { 759 print STDERR "*** The conversion file has existed\n"; 808 print STDERR " The conversion file:\n"; 809 print STDERR " $output_filestem.html\n"; 810 print STDERR " ... already exists. Skipping\n"; 760 811 return 1; 761 812 } … … 803 854 if (-s "$output_filestem.html") { 804 855 open(TMP, "$output_filestem.html"); 805 $line = <TMP>;856 my $line = <TMP>; 806 857 close(TMP); 807 if ($line && $line =~ m/html/ ) {858 if ($line && $line =~ m/html/i) { 808 859 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 809 860 return 1; … … 825 876 } 826 877 878 # Attempt to convert a word document to html with JODConvert scripting program 879 sub openoffice_doc_to_html { 880 my ($input_filename, $output_filestem) = @_; 881 882 if (-e "$output_filestem.html") { 883 print STDERR " The conversion file:\n"; 884 print STDERR " $output_filestem.html\n"; 885 print STDERR " ... skipping\n"; 886 return 1; 887 } 888 889 my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script"); 890 my $oo2html = &util::filename_cat($oo_script_dir,"oo2html"); 891 if (!-e $oo2html) { 892 print STDERR "Error: Unable to find 'oo2html' in: \n"; 893 print STDERR " $oo_script_dir\n"; 894 print STDERR " Is the OpenOffice extension to Greenstone installed?\n"; 895 return 0; 896 } 897 898 my $cmd = ""; 899 if ($timeout) {$cmd = "ulimit -t $timeout;";} 900 $cmd .= "$oo2html \"$input_filename\" \"$output_filestem.html\""; 901 902 # redirecting STDERR 903 $cmd .= " 2> \"$output_filestem.err\"" 904 if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000); 905 906 # execute the command 907 $!=0; 908 if (system($cmd)!=0) 909 { 910 print STDERR "Error executing oo2html converter: $!\n"; 911 print STDERR "Command was: $cmd\n"; 912 913 if (-s "$output_filestem.err") { 914 open (ERRFILE, "<$output_filestem.err"); 915 916 my $write_to_fail_log=0; 917 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 918 {$write_to_fail_log=1;} 919 920 my $line; 921 while ($line=<ERRFILE>) { 922 if ($line =~ m/\w/) { 923 print STDERR "$line"; 924 print FAILLOG "$line" if ($write_to_fail_log); 925 } 926 if ($line !~ m/startup error/) {next;} 927 print STDERR " (given an invalid .DOC file?)\n"; 928 print FAILLOG " (given an invalid .DOC file?)\n" 929 if ($write_to_fail_log); 930 931 } # while ERRFILE 932 close FAILLOG if ($write_to_fail_log); 933 } 934 return 0; # we can try any_to_text 935 } 936 937 # Was the conversion successful? 938 if (-s "$output_filestem.html") { 939 open(TMP, "$output_filestem.html"); 940 my $line = <TMP>; 941 close(TMP); 942 if ($line && $line =~ m/html/i) { 943 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 944 return 1; 945 } 946 } 947 948 # If here, an error of some sort occurred 949 950 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 951 if (-e "$output_filestem.err") { 952 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 953 open (ERRLOG,"$output_filestem.err"); 954 while (<ERRLOG>) {print FAILLOG $_;} 955 close FAILLOG; 956 close ERRLOG; 957 } 958 &util::rm("$output_filestem.err"); 959 } 960 return 0; 961 } 962 827 963 # Attempt to convert an RTF document to html with rtftohtml 828 829 964 sub rtf_to_html { 830 965 my ($input_filename, $output_filestem) = @_; 831 966 832 967 # formulate the command 833 $cmd = "";968 my $cmd = ""; 834 969 if ($timeout) {$cmd = "ulimit -t $timeout;";} 835 970 $cmd .= "rtftohtml"; … … 903 1038 my $line; 904 1039 while ($line=<TOC>) { 905 $line =~ s@</body></html>$@@ ; # only last line has this1040 $line =~ s@</body></html>$@@i ; # only last line has this 906 1041 # make link relative 907 $line =~ s@href=\"[^\#]+@href=\"@ ;1042 $line =~ s@href=\"[^\#]+@href=\"@i; 908 1043 print HTML $line; 909 1044 } … … 949 1084 my ($dirname, $input_filename, $output_filestem) = @_; 950 1085 951 $cmd = "";1086 my $cmd = ""; 952 1087 if ($timeout) {$cmd = "ulimit -t $timeout;";} 953 1088 $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom"; … … 1021 1156 } 1022 1157 1023 $cmd = "";1158 my $cmd = ""; 1024 1159 if ($timeout) {$cmd = "ulimit -t $timeout;";} 1025 1160 $output_type =~ s/.*\_(.*)/$1/i; … … 1316 1451 1317 1452 sub any_to_html { 1318 ($input_filename, $output_filestem) = @_;1453 my ($input_filename, $output_filestem) = @_; 1319 1454 1320 1455 # First generate a text file … … 1354 1489 1355 1490 sub any_to_text { 1356 ($input_filename, $output_filestem) = @_;1491 my ($input_filename, $output_filestem) = @_; 1357 1492 1358 1493 if (!$use_strings) {
Note:
See TracChangeset
for help on using the changeset viewer.