Changeset 10282
- Timestamp:
- 2005-07-25T14:27:31+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/gsConvert.pl
r9482 r10282 63 63 my $pdf_zoom; 64 64 my $pdf_ignore_images; 65 my $windows_scripting; 65 66 66 67 sub print_usage … … 72 73 print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n"; 73 74 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 74 print STDERR "\t-output\t html|text\n";75 print STDERR "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n"; 75 76 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 76 77 print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n"; 78 print STDERR "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n"; 77 79 print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n"; 78 80 print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n"; … … 96 98 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type, 97 99 '/errlog/.*/', \$faillogfile, 98 'output/( html|text)/', \$output_type,100 'output/(auto|html|text|pagedimg).*/', \$output_type, 99 101 'timeout/\d+/0',\$timeout, 100 'verbose/\d+/0', 102 'verbose/\d+/0', \$verbose, 101 103 'use_strings', \$use_strings, 104 'windows_scripting',\$windows_scripting, 102 105 'pdf_complex', \$pdf_complex, 103 106 'pdf_ignore_images', \$pdf_ignore_images, … … 108 111 print_usage(); 109 112 } 110 113 114 111 115 # Make sure the input file exists and can be opened for reading 112 116 if (scalar(@ARGV!=1)) { … … 133 137 my $stored_dir = cwd(); 134 138 chdir ($dirname) || die "Unable to change to directory $dirname"; 135 136 139 # Select convert utility 137 140 if (!defined $input_type) { … … 209 212 210 213 my $success = 0; 214 if (!$output_type || ($output_type =~ /html/i)){ 215 if ($windows_scripting) { 216 print STDERR "***** Calling VB Script!\n"; 217 $success = &native_doc_to_html($input_filename, $output_filestem); 218 } 219 else { 220 print STDERR "**** Calling wvWare\n"; 221 $success = &doc_to_html($input_filename, $output_filestem); 222 } 223 if ($success) { 224 return "html"; 225 } 226 } 211 227 212 228 # Attempt specialised conversion to HTML 213 if (!$output_type || ($output_type =~ /html/i)) {214 $success = &doc_to_html($input_filename, $output_filestem);215 if ($success) {216 return "html";217 }218 }229 #if (!$output_type || ($output_type =~ /html/i)) { 230 # $success = &doc_to_html($input_filename, $output_filestem); 231 # if ($success) { 232 # return "html"; 233 # } 234 # } 219 235 220 236 return &convertAnything($input_filename, $output_filestem, $output_type); … … 313 329 } 314 330 } 315 316 331 return "fail"; 317 318 332 } 319 333 … … 323 337 324 338 my $success = 0; 325 326 # Attempt conversion to HTML 327 if (!$output_type || ($output_type =~ /html/i)) { 339 my $ppt_convert_type = ""; 340 if (!$output_type || $windows_scripting ||($output_type !~ /html/i)){ 341 if ($output_type =~ /gif/i) { 342 $ppt_convert_type = "-g"; 343 } elsif ($output_type =~ /jp?g/i){ 344 $ppt_convert_type = "-j"; 345 } elsif ($output_type =~ /png/i){ 346 $ppt_convert_type = "-p"; 347 } 348 my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 349 $ENV{'GSDLOS'}, "pptextract"); 350 $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i); 351 352 $cmd = ""; 353 #if ($timeout) {$cmd = "ulimit -t $timeout;";} 354 #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\""; 355 #$cmd .= "$vbScript $input_filename $output_filestem.html"; 356 # if the converting directory has already existed 357 if (-d $output_filestem) { 358 print STDERR "**The conversion directory has existed\n"; 359 return "item"; 360 } else { 361 $cmd .= "$vbScript $ppt_convert_type $input_filename $output_filestem"; 362 $cmd .= " 2>\"$output_filestem.err\"" 363 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 364 if (system($cmd) !=0) { 365 print STDERR "Powerpoint VB Scripting convert failed\n"; 366 } else { 367 return "item"; 368 } 369 } 370 } else { 371 # Attempt conversion to HTML 372 #if (!$output_type || ($output_type =~ /html/i)) { 328 373 # formulate the command 329 374 $cmd = ""; … … 332 377 $cmd .= " 2>\"$output_filestem.err\"" 333 378 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 334 335 379 336 380 # execute the command … … 388 432 389 433 390 391 392 434 # Find the real type of a .doc file 393 435 # 394 436 # We seem to have a lot of files with a .doc extension that are .rtf 395 437 # files or Word 5 files. This function attempts to tell the difference. 396 397 438 sub find_docfile_type { 398 439 ($input_filename) = @_; … … 428 469 429 470 430 431 471 # Specific type-to-type conversions 432 472 # … … 438 478 439 479 # Attempt to convert a word document to html with the wv program 440 441 480 sub doc_to_html { 442 481 ($input_filename, $output_filestem) = @_; … … 461 500 $cmd .= " 2> \"$output_filestem.err\"" 462 501 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 463 502 464 503 # execute the command 465 504 $!=0; … … 518 557 } 519 558 559 # Attempt to convert a word document to html with the word2html scripting program 560 sub native_doc_to_html { 561 ($input_filename, $output_filestem) = @_; 562 563 my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", 564 $ENV{'GSDLOS'}, "word2html"); 565 566 $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i); 567 568 my $cmd = ""; 569 if ($timeout) {$cmd = "ulimit -t $timeout;";} 570 #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\""; 571 $cmd .= "$vbScript $input_filename $output_filestem.html"; 572 573 # redirecting STDERR 574 $cmd .= " 2> \"$output_filestem.err\"" 575 if ($ENV {'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 576 577 # execute the command 578 $!=0; 579 if (system($cmd)!=0) 580 { 581 print STDERR "Error executing word2Html converter:$!\n"; 582 if (-s "$output_filestem.err") { 583 open (ERRFILE, "<$output_filestem.err"); 584 585 my $write_to_fail_log=0; 586 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 587 {$write_to_fail_log=1;} 588 589 my $line; 590 while ($line=<ERRFILE>) { 591 if ($line =~ /\w/) { 592 print STDERR "$line"; 593 print FAILLOG "$line" if ($write_to_fail_log); 594 } 595 if ($line !~ m/startup error/) {next;} 596 print STDERR " (given an invalid .DOC file?)\n"; 597 print FAILLOG " (given an invalid .DOC file?)\n" 598 if ($write_to_fail_log); 599 600 } # while ERRFILE 601 close FAILLOG if ($write_to_fail_log); 602 } 603 return 0; # we can try any_to_text 604 } 605 606 # Was the conversion successful? 607 if (-s "$output_filestem.html") { 608 open(TMP, "$output_filestem.html"); 609 $line = <TMP>; 610 close(TMP); 611 if ($line && $line =~ /html/) { 612 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 613 return 1; 614 } 615 } 616 617 # If here, an error of some sort occurred 618 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 619 if (-e "$output_filestem.err") { 620 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 621 open (ERRLOG,"$output_filestem.err"); 622 while (<ERRLOG>) {print FAILLOG $_;} 623 close FAILLOG; 624 close ERRLOG; 625 } 626 &util::rm("$output_filestem.err"); 627 } 628 return 0; 629 } 630 631 520 632 521 633 # Attempt to convert an RTF document to html with rtftohtml … … 528 640 if ($timeout) {$cmd = "ulimit -t $timeout;";} 529 641 $cmd .= "rtftohtml"; 642 #$cmd .= "rtf-converter"; 530 643 531 644 $cmd .= " -o \"$output_filestem.html\" \"$input_filename\""; … … 621 734 { 622 735 print FAILLOG "Error - rtftohtml - couldn't extract text\n"; 736 #print FAILLOG "Error - rtf-converter - couldn't extract text\n"; 623 737 print FAILLOG " (rtf file might be too recent):\n"; 624 738 open (ERRLOG, "$output_filestem.err"); … … 687 801 close FAILLOG; 688 802 } 689 &util::rm("$output_filestem.err");803 &util::rm("$output_filestem.err"); 690 804 } 691 805 return 0; 692 806 } 693 807 694 808 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 695 809 &util::rm("$output_filestem.out") if (-e "$output_filestem.out"); … … 785 899 $cmd .= " 2> $output_filestem.err"; 786 900 $!=0; 787 788 901 my $retcode=system($cmd); 789 902 $retcode = $? >> 8; # see man perlfunc - system for this...
Note:
See TracChangeset
for help on using the changeset viewer.