Changeset 2755
- Timestamp:
- 2001-09-26T10:43:44+12:00 (23 years ago)
- Location:
- trunk/gsdl
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/bin/script/gsConvert.pl
r2656 r2755 28 28 29 29 # gsConvert.pl converts documents in a range of formats to HTML or TEXT 30 # by exploiting third-party programs. These are usually found in the 31 # $GSDLHOME/packages directory. 32 # 33 # Currently, we can convert Microsoft Word and Adobe PDF using specialised 34 # conversion utilities. We can convery any file to text with a perl 35 # implementation of the UNIX strings command. 30 # by exploiting third-party programs. The sources of these are usually found 31 # in the $GSDLHOME/packages directory, and the executables should live in 32 # $GSDLHOME/bin/$GSDLOS (which is on the search path). 33 # 34 # Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript 35 # using specialised conversion utilities. We can try to convert any file to 36 # text with a perl implementation of the UNIX strings command. 36 37 # 37 38 # We try to convert Postscript files to text using "gs" which is often on 38 # *nix machines. If it isn't (or we're running on Windoze), we do some feeble39 # text extraction on it using regexps.39 # *nix machines. We fall back to performing weak text extraction by using 40 # regular expressions. 40 41 41 42 BEGIN { … … 49 50 use File::Basename; 50 51 52 # Are we running on WinNT or Win2000 (or later)? 
53 my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;}; 54 if (!defined($is_winnt_2000)) {$is_winnt_2000=0;} 51 55 52 56 sub print_usage … … 56 60 print STDERR " or text using third-party programs.\n\n"; 57 61 print STDERR " usage: $0 [options] filename\n"; 58 print STDERR " options:\n\t-type\tdoc|pdf|ps|rtf\n\t-output\thtml|text\n"; 59 print STDERR "\t-timeout\t<max cpu seconds>\n"; 62 print STDERR " options:\n\t-type\tdoc|pdf|ps|rtf\t(input file type)\n"; 63 print STDERR "\t-errlog\t<filename>\t(append err messages)\n"; 64 print STDERR "\t-output\thtml|text\n"; 65 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n"; 60 66 exit(1); 61 67 } 62 68 69 my $faillogfile=""; 63 70 64 71 sub main … … 71 78 if (!parsargv::parse(\@ARGV, 72 79 'type/(doc|pdf|ps|rtf)/', \$input_type, 80 '/errlog/.*/', \$faillogfile, 73 81 'output/(html|text)/', \$output_type, 74 82 'timeout/\d+/0',\$timeout, … … 198 206 } 199 207 200 return &convertAnything($input_filename, $output_filestem, $output_type); 208 # rtf is so ugly that's it's not worth running strings over. 209 # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21 210 # return &convertAnything($input_filename, $output_filestem, $output_type); 211 return "fail"; 201 212 } 202 213 … … 232 243 233 244 sub convertPDF { 234 ($dirname, $input_filename, $output_filestem, $output_type) = @_;245 my ($dirname, $input_filename, $output_filestem, $output_type) = @_; 235 246 236 247 my $success = 0; … … 300 311 return "rtf"; 301 312 } 313 $first = 0; 302 314 } 303 315 … … 308 320 } 309 321 310 $first = 0;311 312 322 } 313 323 … … 320 330 # 321 331 # Each of the following functions attempts to convert a document from 322 # a specific format to another. If they succeed yhey return 1 and leave332 # a specific format to another. 
If they succeed they return 1 and leave 323 333 # the output document(s) in the appropriate place; if they fail they 324 334 # return 0 and delete any working files. … … 348 358 # redirecting STDERR is a bad idea on windows 95/98 349 359 $cmd .= " 2> \"$output_filestem.err\"" 350 if $ENV{'GSDLOS'} !~ /^windows$/i;360 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 351 361 352 362 # execute the command 363 $!=0; 353 364 if (system($cmd)!=0) 354 365 { 355 print STDERR "Error executing wv converter: $!. Continuing...\n"; 366 print STDERR "Error executing wv converter:$!\n"; 367 if (-s "$output_filestem.err") { 368 open (ERRFILE, "<$output_filestem.err"); 369 370 my $write_to_fail_log=0; 371 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 372 {$write_to_fail_log=1;} 373 374 my $line; 375 while ($line=<ERRFILE>) { 376 if ($line =~ /\w/) { 377 print STDERR "$line"; 378 print FAILLOG "$line" if ($write_to_fail_log); 379 } 380 if ($line !~ m/startup error/) {next;} 381 print STDERR " (given an invalid .DOC file?)\n"; 382 print FAILLOG " (given an invalid .DOC file?)\n" 383 if ($write_to_fail_log); 384 385 } # while ERRFILE 386 close FAILLOG if ($write_to_fail_log); 387 } 388 print STDERR "Continuing...\n"; 389 return 0; # we can try any_to_text 356 390 } 357 391 … … 365 399 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 366 400 return 1; 367 } else { 368 # An error of some sort occurred 369 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 370 &util::rm("$output_filestem.err") if -e "$output_filestem.err"; 371 } 372 } 373 401 } 402 } 403 404 # If here, an error of some sort occurred 405 &util::rm("$output_filestem.html") if -e "$output_filestem.html"; 406 if (-e "$output_filestem.err") { 407 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) { 408 open (ERRLOG,"$output_filestem.err"); 409 while (<ERRLOG>) {print FAILLOG $_;} 410 close FAILLOG; 411 close ERRLOG; 412 } 413 
&util::rm("$output_filestem.err"); 414 } 415 374 416 return 0; 375 417 } … … 390 432 391 433 $cmd .= " 2>\"$output_filestem.err\"" 392 unless $ENV{'GSDLOS'} =~ /^windows$/i;434 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000); 393 435 394 436 395 437 # execute the command 438 $!=0; 396 439 if (system($cmd)!=0) 397 440 { 398 print STDERR "Error executing rtf converter : $!.\n";441 print STDERR "Error executing rtf converter $!\n"; 399 442 # don't currently bother printing out error log... 400 443 # keep going, in case it still created an HTML file... 401 444 } 402 445 403 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");404 405 446 # Was the conversion successful? 447 my $was_successful=0; 406 448 if (-s "$output_filestem.html") { 407 return 1; 449 # make sure we have some content other than header 450 open (HTML, "$output_filestem.html"); # what to do if fail? 451 my $line; 452 my $past_header=0; 453 while ($line=<HTML>) { 454 455 if ($past_header == 0) { 456 if ($line =~ /<body>/) {$past_header=1;} 457 next; 458 } 459 460 $line =~ s/<[^>]+>//g; 461 if ($line =~ /\w/ && $past_header) { # we found some content... 462 $was_successful=1; 463 last; 464 } 465 } 466 close HTML; 467 } 468 469 if ($was_successful) { 470 &util::rm("$output_filestem.err") 471 if (-e "$output_filestem.err"); 472 # insert the (modified) table of contents, if it exists. 473 if (-e "${output_filestem}_ToC.html") { 474 &util::mv("$output_filestem.html","$output_filestem.src"); 475 my $open_failed=0; 476 open HTMLSRC, "$output_filestem.src" || ++$open_failed; 477 open TOC, "${output_filestem}_ToC.html" || ++$open_failed; 478 open HTML, ">$output_filestem.html" || ++$open_failed; 479 480 if ($open_failed) { 481 close HTMLSRC; 482 close TOC; 483 close HTML; 484 &util::mv("$output_filestem.src","$output_filestem.html"); 485 return 1; 486 } 487 488 # print out header info from src html. 
489 while (($_ = <HTMLSRC>) =~ /\w/) { 490 print HTML "$_"; 491 } 492 493 # print out table of contents, making links relative 494 <TOC>; <TOC>; # ignore first 2 lines 495 print HTML scalar(<TOC>); # line 3 = "<ol>\n" 496 my $line; 497 while ($line=<TOC>) { 498 $line =~ s@</body></html>$@@ ; # only last line has this 499 # make link relative 500 $line =~ s@href=\"[^\#]+@href=\"@; 501 print HTML $line; 502 } 503 close TOC; 504 505 # rest of html src 506 while (<HTMLSRC>) { 507 print HTML $_; 508 } 509 close HTMLSRC; 510 close HTML; 511 512 &util::rm("${output_filestem}_ToC.html"); 513 &util::rm("${output_filestem}.src"); 514 } 515 # we don't yet do anything with footnotes ($output_filestem_fn.html) :( 516 return 1; # success 517 } 518 519 if (-e "$output_filestem.err") { 520 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 521 { 522 print FAILLOG "Error - rtftohtml - couldn't extract text\n"; 523 print FAILLOG " (rtf file might be too recent):\n"; 524 open (ERRLOG, "$output_filestem.err"); 525 while (<ERRLOG>) {print FAILLOG $_;} 526 close ERRLOG; 527 close FAILLOG; 528 } 529 &util::rm("$output_filestem.err"); 408 530 } 409 531 … … 417 539 418 540 sub pdf_to_html { 419 ($dirname, $input_filename, $output_filestem) = @_;541 my ($dirname, $input_filename, $output_filestem) = @_; 420 542 421 543 $cmd = ""; 422 544 if ($timeout) {$cmd = "ulimit -t $timeout;";} 423 $cmd .= "perl -S pdftohtml.pl -F";545 $cmd .= "perl -S pdftohtml.pl "; 424 546 $cmd .= " \"$input_filename\" \"$output_filestem\""; 547 548 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) { 549 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 550 } else { 551 $cmd .= " > \"$output_filestem.err\""; 552 } 553 425 554 $!=0; 426 555 … … 428 557 if ($retval!=0) 429 558 { 430 print STDERR "Error executing $cmd";559 print STDERR "Error executing pdftohtml.pl"; 431 560 if ($!) 
{print STDERR ": $!";} 432 561 print STDERR "\n"; … … 440 569 if (-s "$output_filestem.err") { 441 570 open (ERRLOG, "$output_filestem.err") || die "$!"; 442 print STDERR "pdftohtml :\n";571 print STDERR "pdftohtml error log:\n"; 443 572 while (<ERRLOG>) { 444 573 print STDERR "$_"; … … 447 576 } 448 577 &util::rm("$output_filestem.html") if (-e "$output_filestem.html"); 449 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 578 if (-e "$output_filestem.err") { 579 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 580 { 581 open (ERRLOG, "$output_filestem.err"); 582 while (<ERRLOG>) {print FAILLOG $_;} 583 close ERRLOG; 584 close FAILLOG; 585 } 586 &util::rm("$output_filestem.err"); 587 } 450 588 return 0; 451 589 } … … 459 597 460 598 sub pdf_to_text { 461 ($dirname, $input_filename, $output_filestem) = @_;599 my ($dirname, $input_filename, $output_filestem) = @_; 462 600 463 601 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\""; 464 $cmd .= " 2> \"$output_filestem.err\""; 602 603 if ($ENV{'GSDLOS'} !~ /^windows$/i) { 604 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\""; 605 } else { 606 $cmd .= " > \"$output_filestem.err\""; 607 } 465 608 466 609 if (system($cmd)!=0) … … 468 611 print STDERR "Error executing $cmd: $!\n"; 469 612 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 470 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 471 return 0; 613 } 614 615 # make sure there is some extracted text. 616 if (-e "$output_filestem.text") { 617 open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!"; 618 binmode(EXTR_TEXT); # just in case... 
619 my $line=""; 620 my $seen_text=0; 621 while (($seen_text==0) && ($line=<EXTR_TEXT>)) { 622 if ($line=~ /\w/) {$seen_text=1;} 623 } 624 close EXTR_TEXT; 625 if ($seen_text==0) { # no text was extracted 626 print STDERR "Error: pdftotext found no text\n"; 627 &util::rm("$output_filestem.text"); 628 } 472 629 } 473 630 … … 478 635 if (-s "$output_filestem.err") { 479 636 open (ERRLOG, "$output_filestem.err") || die "$!"; 480 print STDERR "pdftotext :\n";637 print STDERR "pdftotext error log:\n"; 481 638 while (<ERRLOG>) { 482 639 print STDERR "$_"; … … 487 644 &util::rm("$output_filestem.out") if (-e "$output_filestem.out"); 488 645 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 489 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 490 646 if (-e "$output_filestem.err") { 647 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) 648 { 649 open (ERRLOG,"$output_filestem.err"); 650 while (<ERRLOG>) {print FAILLOG $_;} 651 close ERRLOG; 652 close FAILLOG; 653 } 654 &util::rm("$output_filestem.err"); 655 } 491 656 return 0; 492 657 } … … 537 702 if ($error ne "") 538 703 { 539 print STDERR " PSPlug: WARNING: Error executing gs: $error\n";704 print STDERR "Warning: Error executing gs: $error\n"; 540 705 &util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 706 707 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile"))) 708 { 709 print FAILLOG "gs - $error\n"; 710 if (-e "$output_filestem.err") { 711 open(ERRLOG, "$output_filestem.err"); 712 while (<ERRLOG>) {print FAILLOG $_;} 713 close ERRLOG; 714 } 715 close FAILLOG; 716 } 541 717 &util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 718 542 719 543 720 # Fine then. We'll just do a lousy job by ourselves... 
… … 545 722 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html 546 723 # 547 print STDERR " PSPlug:Stripping text from postscript\n";724 print STDERR "Stripping text from postscript\n"; 548 725 my $errorcode=0; 549 726 open (IN, "$input_filename") … … 554 731 555 732 my $text=""; # this is for whole .ps file... 556 while (<IN>) { 557 $text.=$_; 558 } 733 $text = join('', <IN>); # see man perlport, under "System Resources" 559 734 close IN; 560 735 561 736 # Make sure this is a ps file... 562 737 if ($text !~ /^%!/) { 563 print STDERR "Bad postscript header: not %!\n"; 738 print STDERR "Bad postscript header: not '%!'\n"; 739 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) 740 { 741 print FAILLOG "Bad postscript header: not '%!'\n"; 742 close FAILLOG; 743 } 564 744 return 0; 565 745 } … … 666 846 print HTML "</head><body>\n\n"; 667 847 668 while (<TEXT>) { 669 print HTML "<p> ", $_; 848 my $line; 849 while ($line=<TEXT>) { 850 $line =~ s/</&lt;/g; 851 $line =~ s/>/&gt;/g; 852 if ($line =~ /^\s*$/) { 853 print HTML "<p>"; 854 } else { 855 print HTML "<br> ", $line; 856 } 670 857 } 671 858 print HTML "\n</body></html>\n"; … … 680 867 # Convert any file to TEXT with a crude perl implementation of the 681 868 # UNIX strings command. 
869 # Note - this assumes ascii charsets :( (jrm21) 682 870 683 871 sub any_to_text { 684 872 ($input_filename, $output_filestem) = @_; 685 873 686 open(IN, "<$input_filename") ;874 open(IN, "<$input_filename") || return 0; 687 875 binmode(IN); 688 open(OUT, ">$output_filestem.text") ;876 open(OUT, ">$output_filestem.text") || return 0; 689 877 690 878 my ($line); 691 my $ dgcount = 0;879 my $output_line_count = 0; 692 880 while (<IN>) { 693 881 $line = $_; … … 710 898 if ($line =~ /[^\n ]/) { 711 899 print OUT $line; 900 ++$output_line_count; 712 901 } 713 902 } … … 716 905 close IN; 717 906 718 return 1; 719 } 907 if ($output_line_count) { # try to protect against binary only formats 908 return 1; 909 } 910 911 &util::rm("$output_filestem.text"); 912 return 0; 913 914 } -
trunk/gsdl/bin/script/import.pl
r2531 r2755 80 80 print STDOUT " -collectdir directory Collection directory (defaults to " . 81 81 &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n"; 82 print STDOUT " -out Filename or handle to print output status to.\n"; 83 print STDOUT " The default is STDERR\n\n"; 82 print STDOUT " -out name Filename or handle to print output status to.\n"; 83 print STDOUT " -faillog name Filename to log import failures to.\n"; 84 print STDOUT " The default is <collectdir>/colname/etc/fail.log\n\n"; 84 85 print STDOUT " [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]"; 85 86 print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i; … … 93 94 $maxdocs, $collection, $configfilename, $collectcfg, 94 95 $pluginfo, $sortmeta, $archive_info_filename, 95 $archive_info, $processor, $out, $ collectdir);96 $archive_info, $processor, $out, $faillogname, $collectdir); 96 97 97 98 # note that no defaults are passed for most options as they're set … … 110 111 'maxdocs/^\-?\d+/', \$maxdocs, 111 112 'collectdir/.*/', \$collectdir, 112 'out/.*/STDERR', \$out)) { 113 'out/.*/STDERR', \$out, 114 'faillog/.*/', \$faillogname)) { 113 115 &print_usage(); 114 116 die "\n"; … … 131 133 die "\n"; 132 134 } 135 136 # check and/or set fail log file 137 if ($faillogname eq "") { 138 $faillogname="$ENV{GSDLCOLLECTDIR}/etc/fail.log"; 139 } else { 140 my $can_open=1; 141 open (TESTOPEN, ">$faillogname") || ($can_open=0); 142 close (TESTOPEN); 143 if ($can_open==0) { 144 warn "fail.log - cannot write to \"$faillogname\", using default\n \"$ENV{GSDLCOLLECTDIR}/etc/fail.log\" instead.\n"; 145 $faillogname="$ENV{GSDLCOLLECTDIR}/etc/fail.log"; 146 } 147 } 148 # test that default is writable... 
149 if ($faillogname eq "$ENV{GSDLCOLLECTDIR}/etc/fail.log") { 150 my $can_open=1; 151 open (TESTOPEN, ">$faillogname") || ($can_open=0); 152 close (TESTOPEN); 153 if ($can_open==0) { 154 warn "warning - cannot write to \"$faillogname\".\n"; 155 $faillogname=""; 156 } 157 } 158 133 159 134 160 # check sortmeta … … 149 175 # get the list of plugins for this collection and set any options that 150 176 # were specified in the collect.cfg (all import.pl options except 151 # -collectdir and -outmay be specified in the collect.cfg (these177 # -collectdir, -out and -faillog may be specified in the collect.cfg (these 152 178 # options must be known before we read the collect.cfg)) 153 179 my $plugins = []; … … 232 258 233 259 # load all the plugins 234 $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out );260 $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillogname); 235 261 if (scalar(@$pluginfo) == 0) { 236 262 print $out "No plugins were loaded.\n"; -
trunk/gsdl/bin/script/pdftohtml.pl
r2743 r2755 45 45 # note - we don't actually ever use most of these options... 46 46 print STDERR 47 ("pdftohtml version 0.22 - modified for NZDL use\n",47 ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n", 48 48 "Usage: pdftohtml [options] <PDF-file> <html-file>\n", 49 " -f <int> : first page to convert\n", 50 " -l <int> : last page to convert\n", 51 " -d <dir> : target directory (default: basename of pdf-file)\n", 52 " -o <file> : name of output file; - means stdout (default index.html)\n", 53 " -q : don't print any messages or errors\n", 54 " -h : print this usage information\n", 55 " -p : exchange .pdf links by .html\n", 56 # these options now have no effect in gs-custom pdftohtml 57 # " -c : generate complex HTML document\n", 58 # " -F : don't use frames in HTML document\n", 59 " -i : ignore images\n", 60 " -e <string> : set extension for images (in the Html-file) (default png)\n" 49 "Options:\n", 50 "\t-i\tignore images (don't extract)\n", 51 "\t-a\tallow images only (continue even if no text is present)\n" 61 52 ); 62 53 exit (1); … … 65 56 sub main { 66 57 my (@ARGV) = @_; 67 my ($first,$last,$target_dir,$out_file,$img_ext, 68 $optq,$opth,$optp,$optF,$opti); 58 my ($allow_no_text,$ignore_images); 69 59 70 60 # read command-line arguments so that 71 61 # you can change the command in this script 72 62 if (!parsargv::parse(\@ARGV, 73 'f/\d+/1', \$first, 74 'l/\d+/1', \$last, 75 'd/[\S]*/', \$target_dir, 76 'o/[\S]*/', \$out_file, 77 'e/[\S]*/', \$img_ext, 78 'q', \$optq, 79 'h', \$opth, 80 'p', \$optp, 81 # 'c', \$optc, 82 'F', \$optF, 83 'i', \$opti 63 'a', \$allow_no_text, 64 'i', \$ignore_images 84 65 )) 85 66 { … … 119 100 $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/); 120 101 102 $cmd .= " -i" if ($ignore_images); 121 103 $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\""; 122 $cmd .= " > \"$output_filestem.out\"";123 124 # attempting to redirect STDERR on windows 95/98 is a bad idea125 $cmd .= " 2> 
\"$output_filestem.err\""126 if $ENV{'GSDLOS'} !~ /^windows$/i;127 104 128 105 # system() returns -1 if it can't run, otherwise it's $cmds ret val. 129 106 # note we return 0 if the file is "encrypted" 107 $!=0; 130 108 if (system($cmd)!=0) { 131 print STDERR " Error executing $cmd:$!\n";109 print STDERR "pdftohtml error for $input_filename $!\n"; 132 110 # leave these for gsConvert.pl... 133 111 #&util::rm("$output_filestem.text") if (-e "$output_filestem.text"); 134 112 #&util::rm("$output_filestem.err") if (-e "$output_filestem.err"); 135 return 0;113 return 1; 136 114 } 137 115 138 116 if (! -e "$output_filestem.html") { 139 return 0;117 return 1; 140 118 } 141 119 142 120 # post-process to remove </b><b> and </i><i>, as these break up 143 121 # words, screwing up indexing and searching. 122 # At the same time, check that our .html file has some textual content. 144 123 &util::mv("$output_filestem.html","$output_filestem.html.tmp"); 124 $!=0; 145 125 open INFILE, "$output_filestem.html.tmp" || 146 126 die "Couldn't open file: $!"; … … 148 128 die "Couldn't open file for writing: $!"; 149 129 my $line; 130 my $seen_textual_content=$allow_no_text; 150 131 while ($line=<INFILE>) { 151 132 $line =~ s#</b><b>##g; 152 133 $line =~ s#</i><i>##g; 153 134 $line =~ s#\\#\\\\#g; # until macro language parsing is fixed... 
135 # check for any extracted text 136 if ($seen_textual_content == 0) { 137 my $tmp_line=$line; 138 $tmp_line =~ s/<[^>]*>//g; 139 $tmp_line =~ s/Page\s\d+//; 140 $tmp_line =~ s/\s*//g; 141 if ($tmp_line ne "") { 142 $seen_textual_content=1; 143 } 144 } 145 154 146 # escape underscores, but not if they're inside tags (eg img/href names) 155 147 my $inatag = 0; # allow multi-line tags … … 178 170 &util::rm("$output_filestem.html.tmp"); 179 171 180 181 172 # Need to convert images from PPM format to PNG format 182 173 my @images; … … 192 183 } 193 184 close IMAGES; 185 &util::rm("${directory}image.log") if (-e "${directory}image.log"); 186 187 # no need to go any further if there is no text extracted from pdf. 188 if ($seen_textual_content == 0) { 189 print STDERR "Error: PDF contains no extractable text\n"; 190 # remove images... 191 for $image (@images) { 192 chomp($image); 193 &util::rm("${directory}$image"); 194 } 195 return 1; 196 } 197 198 194 199 195 200 for $image (@images) { … … 200 205 if (system($cmd)!=0) { 201 206 print STDERR "Error executing $cmd\n"; 202 #return 0; # not sure about whether to leave this one in or take it out207 #return 1; # not sure about whether to leave this one in or take it out 203 208 next; 204 209 } … … 211 216 if (system($cmd)!=0) { 212 217 print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n"; 213 #return 0; # not sure about whether to leave this one in or take it out218 #return 1; # not sure about whether to leave this one in or take it out 214 219 next; 215 220 } … … 219 224 } 220 225 221 return 1;226 return 0; 222 227 } 223 228 224 # indicate our error status 225 if (&main(@ARGV)) {exit 0;} 226 exit 1; 229 # indicate our error status, 0 = success 230 exit (&main(@ARGV)); 231 -
trunk/gsdl/perllib/plugin.pm
r1587 r2755 30 30 sub load_plugins { 31 31 my ($plugin_list) = shift @_; 32 ($verbosity, $outhandle ) = @_; # globals32 ($verbosity, $outhandle, $faillogname) = @_; # globals 33 33 my @plugin_objects = (); 34 34 … … 58 58 59 59 # initialize plugin 60 $plugobj->init($verbosity, $outhandle );60 $plugobj->init($verbosity, $outhandle, $faillogname); 61 61 62 62 # add this object to the list -
trunk/gsdl/perllib/plugins/BasPlug.pm
r2751 r2755 175 175 sub init { 176 176 my $self = shift (@_); 177 my ($verbosity, $outhandle ) = @_;177 my ($verbosity, $outhandle, $faillogname) = @_; 178 178 179 179 # verbosity is passed through from the processor … … 182 182 # as is the outhandle ... 183 183 $self->{'outhandle'} = $outhandle if defined $outhandle; 184 $self->{'faillogname'} = $faillogname; 184 185 185 186 # set process_exp and block_exp to defaults unless they were -
trunk/gsdl/perllib/plugins/ConvertToPlug.pm
r2751 r2755 168 168 print $outhandle "Converting $tailname$suffix to $convert_to format\n"; 169 169 } 170 171 my $errlog = &util::filename_cat($tmp_dirname, "err.log"); 170 172 171 173 # Execute the conversion command and get the type of the result, 172 174 # making sure the converter gives us the appropriate output type 173 175 my $output_type = lc($convert_to); 174 my $cmd = "perl -S gsConvert.pl -verbose $verbosity - output $output_type \"$tmp_filename\"";176 my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\""; 175 177 $output_type = `$cmd`; 176 178 … … 182 184 if ($output_type eq "fail") { 183 185 print $outhandle "Could not convert $tailname$suffix to $convert_to format\n"; 186 if ($self->{'faillogname'} ne "" && -s "$errlog") { 187 open(SAVELOG, ">>$self->{'faillogname'}"); 188 open(ERRLOG, "$errlog"); 189 print SAVELOG "$tailname$suffix (converting to $convert_to) failed:\n"; 190 while (<ERRLOG>) { 191 print SAVELOG "$_"; 192 } 193 close ERRLOG; 194 print SAVELOG "\n"; 195 close SAVELOG; 196 } 197 &util::rm("$errlog") if (-e "$errlog"); 184 198 return ""; 185 199 }
Note:
See TracChangeset
for help on using the changeset viewer.