Changeset 2755 for trunk/gsdl/bin/script/gsConvert.pl
- Timestamp:
- 2001-09-26T10:43:44+12:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/gsConvert.pl
r2656 r2755 28 28 29 29 # gsConvert.pl converts documents in a range of formats to HTML or TEXT 30 # by exploiting third-party programs. These are usually found in the 31 # $GSDLHOME/packages directory. 32 # 33 # Currently, we can convert Microsoft Word and Adobe PDF using specialised 34 # conversion utilities. We can convery any file to text with a perl 35 # implementation of the UNIX strings command. 30 # by exploiting third-party programs. The sources of these are usually found 31 # in the $GSDLHOME/packages directory, and the executables should live in 32 # $GSDLHOME/bin/$GSDLOS (which is on the search path). 33 # 34 # Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript 35 # using specialised conversion utilities. We can try to convert any file to 36 # text with a perl implementation of the UNIX strings command. 36 37 # 37 38 # We try to convert Postscript files to text using "gs" which is often on 38 # *nix machines. If it isn't (or we're running on Windoze), we do some feeble39 # text extraction on it using regexps.39 # *nix machines. We fall back to performing weak text extraction by using 40 # regular expressions. 40 41 41 42 BEGIN { … … 49 50 use File::Basename; 50 51 52 # Are we running on WinNT or Win2000 (or later)? 
53 my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;}; 54 if (!defined($is_winnt_2000)) {$is_winnt_2000=0;} 51 55 52 56 sub print_usage … … 56 60 print STDERR " or text using third-party programs.\n\n"; 57 61 print STDERR " usage: $0 [options] filename\n"; 58 print STDERR " options:\n\t-type\tdoc|pdf|ps|rtf\n\t-output\thtml|text\n"; 59 print STDERR "\t-timeout\t<max cpu seconds>\n"; 62 print STDERR " options:\n\t-type\tdoc|pdf|ps|rtf\t(input file type)\n"; 63 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 64 print STDERR "\t-output\thtml|text\n"; 65 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 60 66 exit(1); 61 67 } 62 68 69 my $faillogfile=""; 63 70 64 71 sub main … … 71 78 if (!parsargv::parse(\@ARGV, 72 79 'type/(doc|pdf|ps|rtf)/', \$input_type, 80 '/errlog/.*/', \$faillogfile, 73 81 'output/(html|text)/', \$output_type, 74 82 'timeout/\d+/0',\$timeout, … … 198 206 } 199 207 200 return &convertAnything($input_filename, $output_filestem, $output_type); 208 # rtf is so ugly that's it's not worth running strings over. 209 # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21 210 # return &convertAnything($input_filename, $output_filestem, $output_type); 211 return "fail"; 201 212 } 202 213 … … 232 243 233 244 sub convertPDF { 234 ($dirname, $input_filename, $output_filestem, $output_type) = @_;245 my ($dirname, $input_filename, $output_filestem, $output_type) = @_; 235 246 236 247 my $success = 0; … … 300 311 return "rtf"; 301 312 } 313 $first = 0; 302 314 } 303 315 … … 308 320 } 309 321 310 $first = 0;311 312 322 } 313 323 … … 320 330 # 321 331 # Each of the following functions attempts to convert a document from 322 # a specific format to another. If they succeed yhey return 1 and leave332 # a specific format to another. 
If they succeed they return 1 and leave 323 333 # the output document(s) in the appropriate place; if they fail they 324 334 # return 0 and delete any working files. … … 348 358 # redirecting STDERR is a bad idea on windows 95/98 349 359 $cmd .= " 2> \"$output_filestem.err\"" 350 if $ENV{'GSDLOS'} !~ /^windows$/i;360 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 351 361 352 362 # execute the command 363 $!=0; 353 364 if (system($cmd)!=0) 354 365 { 355 print STDERR "Error executing wv converter: $!. Continuing...\n"; 366 print STDERR "Error executing wv converter:$!\n"; 367 if (-s "$output_filestem.err") { 368 open (ERRFILE, "<$output_filestem.err"); 369 370 my $write_to_fail_log=0; 371 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 372 {$write_to_fail_log=1;} 373 374 my $line; 375 while ($line=<ERRFILE>) { 376 if ($line =~ /\w/) { 377 print STDERR "$line"; 378 print FAILLOG "$line" if ($write_to_fail_log); 379 } 380 if ($line !~ m/startup error/) {next;} 381 print STDERR " (given an invalid .DOC file?)\n"; 382 print FAILLOG " (given an invalid .DOC file?)\n" 383 if ($write_to_fail_log); 384 385 } # while ERRFILE 386 close FAILLOG if ($write_to_fail_log); 387 } 388 print STDERR "Continuing...\n"; 389 return 0; # we can try any_to_text 356 390 } 357 391 … … 365 399 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 366 400 return 1; 367 } else { 368 # An error of some sort occurred 369 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 370 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 371 } 372 } 373 401 } 402 } 403 404 # If here, an error of some sort occurred 405 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 406 if (-e "$output_filestem.err") { 407 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 408 open (ERRLOG,"$output_filestem.err"); 409 while (<ERRLOG>) {print FAILLOG $_;} 410 close FAILLOG; 411 close ERRLOG; 412 } 413 
&util::rm("$output_filestem.err"); 414 } 415 374 416 return 0; 375 417 } … … 390 432 391 433 $cmd .= " 2>\"$output_filestem.err\"" 392 unless $ENV{'GSDLOS'} =~ /^windows$/i;434 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 393 435 394 436 395 437 # execute the command 438 $!=0; 396 439 if (system($cmd)!=0) 397 440 { 398 print STDERR "Error executing rtf converter : $!.\n";441 print STDERR "Error executing rtf converter $!\n"; 399 442 # don't currently bother printing out error log... 400 443 # keep going, in case it still created an HTML file... 401 444 } 402 445 403 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");404 405 446 # Was the conversion successful? 447 my $was_successful=0; 406 448 if (-s "$output_filestem.html") { 407 return 1; 449 # make sure we have some content other than header 450 open (HTML, "$output_filestem.html"); # what to do if fail? 451 my $line; 452 my $past_header=0; 453 while ($line=<HTML>) { 454 455 if ($past_header == 0) { 456 if ($line =~ /<body>/) {$past_header=1;} 457 next; 458 } 459 460 $line =~ s/<[^>]+>//g; 461 if ($line =~ /\w/ && $past_header) { # we found some content... 462 $was_successful=1; 463 last; 464 } 465 } 466 close HTML; 467 } 468 469 if ($was_successful) { 470 &util::rm("$output_filestem.err") 471 if (-e "$output_filestem.err"); 472 # insert the (modified) table of contents, if it exists. 473 if (-e "${output_filestem}_ToC.html") { 474 &util::mv("$output_filestem.html","$output_filestem.src"); 475 my $open_failed=0; 476 open HTMLSRC, "$output_filestem.src" || ++$open_failed; 477 open TOC, "${output_filestem}_ToC.html" || ++$open_failed; 478 open HTML, ">$output_filestem.html" || ++$open_failed; 479 480 if ($open_failed) { 481 close HTMLSRC; 482 close TOC; 483 close HTML; 484 &util::mv("$output_filestem.src","$output_filestem.html"); 485 return 1; 486 } 487 488 # print out header info from src html. 
489 while (($_ = <HTMLSRC>) =~ /\w/) { 490 print HTML "$_"; 491 } 492 493 # print out table of contents, making links relative 494 <TOC>; <TOC>; # ignore first 2 lines 495 print HTML scalar(<TOC>); # line 3 = "<ol>\n" 496 my $line; 497 while ($line=<TOC>) { 498 $line =~ s@</body></html>$@@ ; # only last line has this 499 # make link relative 500 $line =~ s@href=\"[^\#]+@href=\"@; 501 print HTML $line; 502 } 503 close TOC; 504 505 # rest of html src 506 while (<HTMLSRC>) { 507 print HTML $_; 508 } 509 close HTMLSRC; 510 close HTML; 511 512 &util::rm("${output_filestem}_ToC.html"); 513 &util::rm("${output_filestem}.src"); 514 } 515 # we don't yet do anything with footnotes ($output_filestem_fn.html) :( 516 return 1; # success 517 } 518 519 if (-e "$output_filestem.err") { 520 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 521 { 522 print FAILLOG "Error - rtftohtml - couldn't extract text\n"; 523 print FAILLOG " (rtf file might be too recent):\n"; 524 open (ERRLOG, "$output_filestem.err"); 525 while (<ERRLOG>) {print FAILLOG $_;} 526 close ERRLOG; 527 close FAILLOG; 528 } 529 &util::rm("$output_filestem.err"); 408 530 } 409 531 … … 417 539 418 540 sub pdf_to_html { 419 ($dirname, $input_filename, $output_filestem) = @_;541 my ($dirname, $input_filename, $output_filestem) = @_; 420 542 421 543 $cmd = ""; 422 544 if ($timeout) {$cmd = "ulimit -t $timeout;";} 423 $cmd .= "perl -S pdftohtml.pl -F";545 $cmd .= "perl -S pdftohtml.pl "; 424 546 $cmd .= " \"$input_filename\" \"$output_filestem\""; 547 548 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) { 549 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 550 } else { 551 $cmd .= " > \"$output_filestem.err\""; 552 } 553 425 554 $!=0; 426 555 … … 428 557 if ($retval!=0) 429 558 { 430 print STDERR "Error executing $cmd";559 print STDERR "Error executing pdftohtml.pl"; 431 560 if ($!) 
{print STDERR ": $!";} 432 561 print STDERR "\n"; … … 440 569 if (-s "$output_filestem.err") { 441 570 open (ERRLOG, "$output_filestem.err") || die "$!"; 442 print STDERR "pdftohtml :\n";571 print STDERR "pdftohtml error log:\n"; 443 572 while (<ERRLOG>) { 444 573 print STDERR "$_"; … … 447 576 } 448 577 &util::rm("$output_filestem.html") if (-e "$output_filestem.html"); 449 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 578 if (-e "$output_filestem.err") { 579 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 580 { 581 open (ERRLOG, "$output_filestem.err"); 582 while (<ERRLOG>) {print FAILLOG $_;} 583 close ERRLOG; 584 close FAILLOG; 585 } 586 &util::rm("$output_filestem.err"); 587 } 450 588 return 0; 451 589 } … … 459 597 460 598 sub pdf_to_text { 461 ($dirname, $input_filename, $output_filestem) = @_;599 my ($dirname, $input_filename, $output_filestem) = @_; 462 600 463 601 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\""; 464 $cmd .= " 2> \"$output_filestem.err\""; 602 603 if ($ENV{'GSDLOS'} !~ /^windows$/i) { 604 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 605 } else { 606 $cmd .= " > \"$output_filestem.err\""; 607 } 465 608 466 609 if (system($cmd)!=0) … … 468 611 print STDERR "Error executing $cmd: $!\n"; 469 612 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 470 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 471 return 0; 613 } 614 615 # make sure there is some extracted text. 616 if (-e "$output_filestem.text") { 617 open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!"; 618 binmode(EXTR_TEXT); # just in case... 
619 my $line=""; 620 my $seen_text=0; 621 while (($seen_text==0) && ($line=<EXTR_TEXT>)) { 622 if ($line=~ /\w/) {$seen_text=1;} 623 } 624 close EXTR_TEXT; 625 if ($seen_text==0) { # no text was extracted 626 print STDERR "Error: pdftotext found no text\n"; 627 &util::rm("$output_filestem.text"); 628 } 472 629 } 473 630 … … 478 635 if (-s "$output_filestem.err") { 479 636 open (ERRLOG, "$output_filestem.err") || die "$!"; 480 print STDERR "pdftotext :\n";637 print STDERR "pdftotext error log:\n"; 481 638 while (<ERRLOG>) { 482 639 print STDERR "$_"; … … 487 644 &util::rm("$output_filestem.out") if (-e "$output_filestem.out"); 488 645 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 489 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 490 646 if (-e "$output_filestem.err") { 647 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 648 { 649 open (ERRLOG,"$output_filestem.err"); 650 while (<ERRLOG>) {print FAILLOG $_;} 651 close ERRLOG; 652 close FAILLOG; 653 } 654 &util::rm("$output_filestem.err"); 655 } 491 656 return 0; 492 657 } … … 537 702 if ($error ne "") 538 703 { 539 print STDERR " PSPlug: WARNING: Error executing gs: $error\n";704 print STDERR "Warning: Error executing gs: $error\n"; 540 705 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 706 707 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile"))) 708 { 709 print FAILLOG "gs - $error\n"; 710 if (-e "$output_filestem.err") { 711 open(ERRLOG, "$output_filestem.err"); 712 while (<ERRLOG>) {print FAILLOG $_;} 713 close ERRLOG; 714 } 715 close FAILLOG; 716 } 541 717 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 718 542 719 543 720 # Fine then. We'll just do a lousy job by ourselves... 
… … 545 722 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html 546 723 # 547 print STDERR " PSPlug:Stripping text from postscript\n";724 print STDERR "Stripping text from postscript\n"; 548 725 my $errorcode=0; 549 726 open (IN, "$input_filename") … … 554 731 555 732 my $text=""; # this is for whole .ps file... 556 while (<IN>) { 557 $text.=$_; 558 } 733 $text = join('', <IN>); # see man perlport, under "System Resources" 559 734 close IN; 560 735 561 736 # Make sure this is a ps file... 562 737 if ($text !~ /^%!/) { 563 print STDERR "Bad postscript header: not %!\n"; 738 print STDERR "Bad postscript header: not '%!'\n"; 739 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) 740 { 741 print FAILLOG "Bad postscript header: not '%!'\n"; 742 close FAILLOG; 743 } 564 744 return 0; 565 745 } … … 666 846 print HTML "</head><body>\n\n"; 667 847 668 while (<TEXT>) { 669 print HTML "<p> ", $_; 848 my $line; 849 while ($line=<TEXT>) { 850 $line =~ s/</&lt;/g; 851 $line =~ s/>/&gt;/g; 852 if ($line =~ /^\s*$/) { 853 print HTML "<p>"; 854 } else { 855 print HTML "<br> ", $line; 856 } 670 857 } 671 858 print HTML "\n</body></html>\n"; … … 680 867 # Convert any file to TEXT with a crude perl implementation of the 681 868 # UNIX strings command. 
869 # Note - this assumes ascii charsets :( (jrm21) 682 870 683 871 sub any_to_text { 684 872 ($input_filename, $output_filestem) = @_; 685 873 686 open(IN, "<$input_filename") ;874 open(IN, "<$input_filename") || return 0; 687 875 binmode(IN); 688 open(OUT, ">$output_filestem.text") ;876 open(OUT, ">$output_filestem.text") || return 0; 689 877 690 878 my ($line); 691 my $ dgcount = 0;879 my $output_line_count = 0; 692 880 while (<IN>) { 693 881 $line = $_; … … 710 898 if ($line =~ /[^\n ]/) { 711 899 print OUT $line; 900 ++$output_line_count; 712 901 } 713 902 } … … 716 905 close IN; 717 906 718 return 1; 719 } 907 if ($output_line_count) { # try to protect against binary only formats 908 return 1; 909 } 910 911 &util::rm("$output_filestem.text"); 912 return 0; 913 914 }
Note:
See TracChangeset
for help on using the changeset viewer.