Changeset 22642
- Timestamp:
- 2010-08-17T12:24:42+12:00 (14 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone2/bin/script/gsConvert.pl
r22596 r22642 67 67 my $pdf_allow_images_only; 68 68 my $windows_scripting; 69 my $openoffice_scripting;70 69 71 70 sub print_usage … … 75 74 print STDERR " or text using third-party programs.\n\n"; 76 75 print STDERR " usage: $0 [options] filename\n"; 77 if ($openoffice_scripting) { 78 print STDERR " options:\n\t-type\tdoc|dot|docx|odf|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 79 } 80 else { 81 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 82 } 76 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 83 77 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 84 78 print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n"; … … 86 80 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 87 81 print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n"; 88 print STDERR "\t-openoffice_scripting\tuse OpenOffice (if available) to convert Microsoft Office documents \n";89 82 print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 90 83 print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; … … 106 99 107 100 108 # scan for -openoffice_scripting as it effects the permissible109 # values for -type110 111 foreach my $a (@ARGV) {112 if ($a =~ m/^-openoffice_scripting$/) {113 $openoffice_scripting = 1;114 last;115 }116 }117 118 my $parse_type;119 if ($openoffice_scripting) {120 $parse_type = 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/';121 }122 else {123 $parse_type = 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';124 }125 126 101 # read command-line arguments 127 102 if (!parsargv::parse(\@ARGV, 128 $parse_type, \$input_type,103 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type, 129 104 '/errlog/.*/', \$faillogfile, 130 105 'output/(auto|html|text|pagedimg).*/', \$output_type, … … 132 107 'verbose/\d+/0', \$verbose, 133 
108 'windows_scripting',\$windows_scripting, 134 'openoffice_scripting',\$openoffice_scripting,135 109 'use_strings', \$use_strings, 136 110 'pdf_complex', \$pdf_complex, … … 173 147 print STDERR "Error: No filename extension or input type defined\n"; 174 148 exit(1); 175 }176 elsif ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) {177 print &convertDOC($input_filename, $output_filestem, $output_type);178 print "\n";179 149 } 180 150 elsif ($input_type eq "doc" || $input_type eq "dot") { … … 230 200 my ($input_filename, $output_filestem, $output_type) = @_; 231 201 232 if (($openoffice_scripting) && ($input_filename =~ m/\.docx?$/i)) {233 # Jump right in and process with Open Office234 if (openoffice_doc_to_html($input_filename, $output_filestem)) {235 return "html";236 }237 else {238 return "fail";239 }240 }241 242 202 # Many .doc files are not in fact word documents! 243 203 my $realtype = &find_docfile_type($input_filename); … … 262 222 $success = &native_doc_to_html($input_filename, $output_filestem); 263 223 } 264 elsif ($openoffice_scripting) {265 $success = &openoffice_doc_to_html($input_filename, $output_filestem);266 }267 224 else { 268 225 $success = &doc_to_html($input_filename, $output_filestem); … … 288 245 if ($windows_scripting) { 289 246 $success = &native_doc_to_html($input_filename, $output_filestem); 290 }291 elsif ($openoffice_scripting) {292 $success = &openoffice_doc_to_html($input_filename, $output_filestem);293 247 } 294 248 else { … … 402 356 my $ppt_convert_type = ""; 403 357 404 if ($openoffice_scripting) {405 # Jump right in and process with Open Office406 if (openoffice_doc_to_html($input_filename, $output_filestem)) {407 return "html";408 }409 else {410 return "fail";411 }412 }413 414 358 #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){ 415 359 if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){ … … 475 419 my $success = 
0; 476 420 477 if ($openoffice_scripting) {478 # Jump right in and process with Open Office479 if (openoffice_doc_to_html($input_filename, $output_filestem)) {480 return "html";481 }482 else {483 return "fail";484 }485 }486 487 421 # Attempt conversion to HTML 488 422 if (!$output_type || ($output_type =~ m/html/i)) { … … 897 831 } 898 832 899 # Attempt to convert a word document to html with JODConvert scripting program900 sub openoffice_doc_to_html {901 my ($input_filename, $output_filestem) = @_;902 903 if (-e "$output_filestem.html") {904 print STDERR " The conversion file:\n";905 print STDERR " $output_filestem.html\n";906 print STDERR " ... skipping\n";907 return 1;908 }909 910 my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script");911 my $oo2html = &util::filename_cat($oo_script_dir,"oo2html");912 if (!-e $oo2html) {913 print STDERR "Error: Unable to find 'oo2html' in: \n";914 print STDERR " $oo_script_dir\n";915 print STDERR " Is the OpenOffice extension to Greenstone installed?\n";916 return 0;917 }918 919 my $cmd = "";920 if ($timeout) {$cmd = "ulimit -t $timeout;";}921 $cmd .= "$oo2html \"$input_filename\" \"$output_filestem.html\"";922 923 # redirecting STDERR924 $cmd .= " 2> \"$output_filestem.err\""925 if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);926 927 # execute the command928 $!=0;929 if (system($cmd)!=0)930 {931 print STDERR "Error executing oo2html converter: $!\n";932 print STDERR "Command was: $cmd\n";933 934 if (-s "$output_filestem.err") {935 open (ERRFILE, "<$output_filestem.err");936 937 my $write_to_fail_log=0;938 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))939 {$write_to_fail_log=1;}940 941 my $line;942 while ($line=<ERRFILE>) {943 if ($line =~ m/\w/) {944 print STDERR "$line";945 print FAILLOG "$line" if ($write_to_fail_log);946 }947 if ($line !~ m/startup error/) {next;}948 print STDERR " (given an invalid .DOC file?)\n";949 print FAILLOG " (given an invalid .DOC file?)\n"950 
if ($write_to_fail_log);951 952 } # while ERRFILE953 close FAILLOG if ($write_to_fail_log);954 }955 return 0; # we can try any_to_text956 }957 958 # Was the conversion successful?959 if (-s "$output_filestem.html") {960 open(TMP, "$output_filestem.html");961 my $line = <TMP>;962 close(TMP);963 if ($line && $line =~ m/html/i) {964 &util::rm("$output_filestem.err") if -e "$output_filestem.err";965 return 1;966 }967 }968 969 # If here, an error of some sort occurred970 971 &util::rm("$output_filestem.html") if -e "$output_filestem.html";972 if (-e "$output_filestem.err") {973 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {974 open (ERRLOG,"$output_filestem.err");975 while (<ERRLOG>) {print FAILLOG $_;}976 close FAILLOG;977 close ERRLOG;978 }979 &util::rm("$output_filestem.err");980 }981 return 0;982 }983 984 833 # Attempt to convert an RTF document to html with rtftohtml 985 834 sub rtf_to_html {
Note: See TracChangeset for help on using the changeset viewer.