Changeset 2755


Ignore:
Timestamp:
2001-09-26T10:43:44+12:00 (23 years ago)
Author:
jrm21
Message:

import.pl now takes a -faillog option for saving file conversion failures to a log.
By default, import.pl will use <collectdir>/etc/fail.log (i.e. $GSDLCOLLECTDIR/etc/fail.log).
Currently, only the plugins based on ConvertToPlug write to this log. Not yet tested on Win9X.

Location:
trunk/gsdl
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r2656 r2755  
    2828
    2929# gsConvert.pl converts documents in a range of formats to HTML or TEXT
    30 # by exploiting third-party programs.  These are usually found in the
    31 # $GSDLHOME/packages directory.
    32 #
    33 # Currently, we can convert Microsoft Word and Adobe PDF using specialised
    34 # conversion utilities.   We can convery any file to text with a perl
    35 # implementation of the UNIX strings command.
     30# by exploiting third-party programs.  The sources of these are usually found
     31# in the $GSDLHOME/packages directory, and the executables should live in
     32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
     33#
     34# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
     35# using specialised conversion utilities.   We can try to convert any file to
     36# text with a perl implementation of the UNIX strings command.
    3637#
    3738# We try to convert Postscript files to text using "gs" which is often on
    38 # *nix machines. If it isn't (or we're running on Windoze), we do some feeble
    39 # text extraction on it using regexps.
     39# *nix machines. We fall back to performing weak text extraction by using
     40# regular expressions.
    4041
    4142BEGIN {
     
    4950use File::Basename;
    5051
     52# Are we running on WinNT or Win2000 (or later)?
     53my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
     54if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
    5155
    5256sub print_usage
     
    5660    print STDERR "              or text using third-party programs.\n\n";
    5761    print STDERR "  usage: $0 [options] filename\n";
    58     print STDERR "  options:\n\t-type\tdoc|pdf|ps|rtf\n\t-output\thtml|text\n";
    59     print STDERR "\t-timeout\t<max cpu seconds>\n";
     62    print STDERR "  options:\n\t-type\tdoc|pdf|ps|rtf\t(input file type)\n";
     63    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
     64    print STDERR "\t-output\thtml|text\n";
     65    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    6066    exit(1);
    6167}
    6268
     69my $faillogfile="";
    6370
    6471sub main
     
    7178    if (!parsargv::parse(\@ARGV,
    7279             'type/(doc|pdf|ps|rtf)/', \$input_type,
     80             '/errlog/.*/', \$faillogfile,
    7381             'output/(html|text)/', \$output_type,
    7482             'timeout/\d+/0',\$timeout,
     
    198206    }
    199207
    200     return &convertAnything($input_filename, $output_filestem, $output_type);
     208# rtf is so ugly that it's not worth running strings over.
     209# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
     210#    return &convertAnything($input_filename, $output_filestem, $output_type);
     211    return "fail";
    201212}
    202213
     
    232243
    233244sub convertPDF {
    234     ($dirname, $input_filename, $output_filestem, $output_type) = @_;
     245    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
    235246
    236247    my $success = 0;
     
    300311        return "rtf";
    301312        }
     313        $first = 0;
    302314    }
    303315   
     
    308320    }
    309321
    310     $first = 0;
    311 
    312322    }
    313323
     
    320330#
    321331# Each of the following functions attempts to convert a document from
    322 # a specific format to another.  If they succeed yhey return 1 and leave
     332# a specific format to another.  If they succeed they return 1 and leave
    323333# the output document(s) in the appropriate place; if they fail they
    324334# return 0 and delete any working files.
     
    348358    # redirecting STDERR is a bad idea on windows 95/98
    349359    $cmd .= " 2> \"$output_filestem.err\""
    350     if $ENV{'GSDLOS'} !~ /^windows$/i;
     360    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
    351361
    352362    # execute the command
     363    $!=0;
    353364    if (system($cmd)!=0)
    354365    {
    355     print STDERR "Error executing wv converter: $!. Continuing...\n";
     366    print STDERR "Error executing wv converter:$!\n";
     367    if (-s "$output_filestem.err") {
     368        open (ERRFILE, "<$output_filestem.err");
     369
     370        my $write_to_fail_log=0;
     371        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     372        {$write_to_fail_log=1;}
     373
     374        my $line;
     375        while ($line=<ERRFILE>) {
     376        if ($line =~ /\w/) {
     377            print STDERR "$line";
     378            print FAILLOG "$line" if ($write_to_fail_log);
     379        }
     380        if ($line !~ m/startup error/) {next;}
     381        print STDERR " (given an invalid .DOC file?)\n";
     382        print FAILLOG " (given an invalid .DOC file?)\n"
     383        if ($write_to_fail_log);
     384       
     385        } # while ERRFILE
     386        close FAILLOG if ($write_to_fail_log);
     387    }
     388    print STDERR "Continuing...\n";
     389    return 0; # we can try any_to_text
    356390    }
    357391
     
    365399        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
    366400        return 1;
    367     } else {
    368         # An error of some sort occurred
    369         &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    370         &util::rm("$output_filestem.err") if -e "$output_filestem.err";
    371     }
    372     }
    373 
     401    }
     402    }
     403   
     404    # If here, an error of some sort occurred
     405    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
     406    if (-e "$output_filestem.err") {
     407    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
     408        open (ERRLOG,"$output_filestem.err");
     409        while (<ERRLOG>) {print FAILLOG $_;}
     410        close FAILLOG;
     411        close ERRLOG;
     412    }
     413    &util::rm("$output_filestem.err");
     414    }
     415   
    374416    return 0;
    375417}
     
    390432
    391433    $cmd .= " 2>\"$output_filestem.err\""
    392         unless $ENV{'GSDLOS'} =~ /^windows$/i;
     434        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
    393435
    394436
    395437    # execute the command
     438    $!=0;
    396439    if (system($cmd)!=0)
    397440    {
    398     print STDERR "Error executing rtf converter: $!.\n";
     441    print STDERR "Error executing rtf converter $!\n";
    399442    # don't currently bother printing out error log...
    400443    # keep going, in case it still created an HTML file...
    401444    }
    402445
    403     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    404 
    405446    # Was the conversion successful?
     447    my $was_successful=0;
    406448    if (-s "$output_filestem.html") {
    407     return 1;
     449    # make sure we have some content other than header
     450    open (HTML, "$output_filestem.html"); # what to do if fail?
     451    my $line;
     452    my $past_header=0;
     453    while ($line=<HTML>) {
     454
     455        if ($past_header == 0) {
     456        if ($line =~ /<body>/) {$past_header=1;}
     457        next;
     458        }
     459
     460        $line =~ s/<[^>]+>//g;
     461        if ($line =~ /\w/ && $past_header) {  # we found some content...
     462        $was_successful=1;
     463        last;
     464        }
     465    }
     466    close HTML;
     467    }
     468
     469    if ($was_successful) {
     470    &util::rm("$output_filestem.err")
     471        if (-e "$output_filestem.err");
     472    # insert the (modified) table of contents, if it exists.
     473    if (-e "${output_filestem}_ToC.html") {
     474        &util::mv("$output_filestem.html","$output_filestem.src");
     475        my $open_failed=0;
     476        open HTMLSRC, "$output_filestem.src" || ++$open_failed;
     477        open TOC, "${output_filestem}_ToC.html" || ++$open_failed;
     478        open HTML, ">$output_filestem.html" || ++$open_failed;
     479       
     480        if ($open_failed) {
     481        close HTMLSRC;
     482        close TOC;
     483        close HTML;
     484        &util::mv("$output_filestem.src","$output_filestem.html");
     485        return 1;
     486        }
     487
     488        # print out header info from src html.
     489        while (($_ = <HTMLSRC>) =~ /\w/) {
     490        print HTML "$_";
     491        }
     492
     493        # print out table of contents, making links relative
     494        <TOC>; <TOC>; # ignore first 2 lines
     495        print HTML scalar(<TOC>); # line 3 = "<ol>\n"
     496        my $line;
     497        while ($line=<TOC>) {
     498        $line =~ s@</body></html>$@@ ; # only last line has this
     499        # make link relative
     500        $line =~ s@href=\"[^\#]+@href=\"@;
     501        print HTML $line;
     502        }
     503        close TOC;
     504
     505        # rest of html src
     506        while (<HTMLSRC>) {
     507        print HTML $_;
     508        }
     509        close HTMLSRC;
     510        close HTML;
     511
     512        &util::rm("${output_filestem}_ToC.html");
     513        &util::rm("${output_filestem}.src");
     514    }
     515    # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
     516    return 1; # success
     517    }
     518
     519    if (-e "$output_filestem.err") {
     520    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     521    {
     522        print FAILLOG "Error - rtftohtml - couldn't extract text\n";
     523        print FAILLOG " (rtf file might be too recent):\n";
     524        open (ERRLOG, "$output_filestem.err");
     525        while (<ERRLOG>) {print FAILLOG $_;}
     526        close ERRLOG;
     527        close FAILLOG;
     528    }
     529    &util::rm("$output_filestem.err");
    408530    }
    409531
     
    417539
    418540sub pdf_to_html {
    419     ($dirname, $input_filename, $output_filestem) = @_;
     541    my ($dirname, $input_filename, $output_filestem) = @_;
    420542
    421543    $cmd = "";
    422544    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    423     $cmd .= "perl -S pdftohtml.pl -F ";
     545    $cmd .= "perl -S pdftohtml.pl ";
    424546    $cmd .= " \"$input_filename\" \"$output_filestem\"";
     547   
     548    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
     549    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
     550    } else {
     551    $cmd .= " > \"$output_filestem.err\"";
     552    }
     553
    425554    $!=0;
    426555
     
    428557    if ($retval!=0)
    429558    {
    430     print STDERR "Error executing $cmd";
     559    print STDERR "Error executing pdftohtml.pl";
    431560    if ($!) {print STDERR ": $!";}
    432561    print STDERR "\n";
     
    440569    if (-s "$output_filestem.err") {
    441570        open (ERRLOG, "$output_filestem.err") || die "$!";
    442         print STDERR "pdftohtml:\n";
     571        print STDERR "pdftohtml error log:\n";
    443572        while (<ERRLOG>) {
    444573        print STDERR "$_";
     
    447576    }
    448577    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
    449     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
     578    if (-e "$output_filestem.err") {
     579        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     580        {
     581        open (ERRLOG, "$output_filestem.err");
     582        while (<ERRLOG>) {print FAILLOG $_;}
     583        close ERRLOG;
     584        close FAILLOG;
     585        }   
     586    &util::rm("$output_filestem.err");
     587    }
    450588    return 0;
    451589    }
     
    459597
    460598sub pdf_to_text {
    461     ($dirname, $input_filename, $output_filestem) = @_;
     599    my ($dirname, $input_filename, $output_filestem) = @_;
    462600
    463601    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    464     $cmd .= " 2> \"$output_filestem.err\"";
     602
     603    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
     604    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
     605    } else {
     606    $cmd .= " > \"$output_filestem.err\"";
     607    }
    465608   
    466609    if (system($cmd)!=0)
     
    468611    print STDERR "Error executing $cmd: $!\n";
    469612    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    470     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    471     return 0;
     613    }
     614
     615    # make sure there is some extracted text.
     616    if (-e "$output_filestem.text") {
     617    open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
     618    binmode(EXTR_TEXT); # just in case...
     619    my $line="";
     620    my $seen_text=0;
     621    while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
     622        if ($line=~ /\w/) {$seen_text=1;}
     623    }
     624    close EXTR_TEXT;
     625    if ($seen_text==0) { # no text was extracted
     626        print STDERR "Error: pdftotext found no text\n";
     627        &util::rm("$output_filestem.text");
     628    }
    472629    }
    473630
     
    478635    if (-s "$output_filestem.err") {
    479636        open (ERRLOG, "$output_filestem.err") || die "$!";
    480         print STDERR "pdftotext:\n";
     637        print STDERR "pdftotext error log:\n";
    481638        while (<ERRLOG>) {
    482639        print STDERR "$_";
     
    487644    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    488645    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    489     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    490 
     646    if (-e "$output_filestem.err") {
     647        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     648        {
     649        open (ERRLOG,"$output_filestem.err");
     650        while (<ERRLOG>) {print FAILLOG $_;}
     651        close ERRLOG;
     652        close FAILLOG;
     653        }
     654        &util::rm("$output_filestem.err");
     655    }
    491656    return 0;
    492657    }
     
    537702    if ($error ne "")
    538703    {
    539     print STDERR "PSPlug: WARNING: Error executing gs: $error\n";
     704    print STDERR "Warning: Error executing gs: $error\n";
    540705    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
     706
     707    if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
     708    {
     709        print FAILLOG "gs - $error\n";
     710        if (-e "$output_filestem.err") {
     711        open(ERRLOG, "$output_filestem.err");
     712        while (<ERRLOG>) {print FAILLOG $_;}
     713        close ERRLOG;
     714        }
     715        close FAILLOG;
     716    }
    541717    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
     718
    542719
    543720    # Fine then. We'll just do a lousy job by ourselves...
     
    545722    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
    546723    #
    547     print STDERR "PSPlug: Stripping text from postscript\n";
     724    print STDERR "Stripping text from postscript\n";
    548725    my $errorcode=0;
    549726    open (IN, "$input_filename")
     
    554731   
    555732    my $text="";  # this is for whole .ps file...
    556     while (<IN>) {
    557         $text.=$_;
    558     }
     733    $text = join('', <IN>); # see man perlport, under "System Resources"
    559734    close IN;
    560735
    561736    # Make sure this is a ps file...
    562737    if ($text !~ /^%!/) {
    563         print STDERR "Bad postscript header: not %!\n";
     738        print STDERR "Bad postscript header: not '%!'\n";
     739        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
     740        {
     741        print FAILLOG "Bad postscript header: not '%!'\n";
     742        close FAILLOG;
     743        }
    564744        return 0;
    565745    }
     
    666846    print HTML "</head><body>\n\n";
    667847
    668     while (<TEXT>) {
    669     print HTML "<p> ", $_;
     848    my $line;
     849    while ($line=<TEXT>) {
     850    $line =~ s/</&lt;/g;
     851    $line =~ s/>/&gt;/g;
     852    if ($line =~ /^\s*$/) {
     853        print HTML "<p>";
     854    } else {
     855        print HTML "<br> ", $line;
     856    }
    670857    }
    671858    print HTML "\n</body></html>\n";
     
    680867# Convert any file to TEXT with a crude perl implementation of the
    681868# UNIX strings command.
     869# Note - this assumes ascii charsets :(     (jrm21)
    682870
    683871sub any_to_text {
    684872    ($input_filename, $output_filestem) = @_;
    685873
    686     open(IN, "<$input_filename");
     874    open(IN, "<$input_filename") || return 0;
    687875    binmode(IN);
    688     open(OUT, ">$output_filestem.text");
     876    open(OUT, ">$output_filestem.text") || return 0;
    689877
    690878    my ($line);
    691     my $dgcount = 0;
     879    my $output_line_count = 0;
    692880    while (<IN>) {
    693881    $line = $_;
     
    710898    if ($line =~ /[^\n ]/) {
    711899        print OUT $line;
     900        ++$output_line_count;
    712901    }
    713902    }
     
    716905    close IN;
    717906
    718     return 1;
    719 }
     907    if ($output_line_count) { # try to protect against binary only formats
     908    return 1;
     909    }
     910
     911    &util::rm("$output_filestem.text");
     912    return 0;
     913
     914}
  • trunk/gsdl/bin/script/import.pl

    r2531 r2755  
    8080    print STDOUT "   -collectdir directory  Collection directory (defaults to " .
    8181    &util::filename_cat ($ENV{'GSDLHOME'}, "collect") . ")\n";
    82     print STDOUT "   -out                   Filename or handle to print output status to.\n";
    83     print STDOUT "                          The default is STDERR\n\n";
     82    print STDOUT "   -out name              Filename or handle to print output status to.\n";
     83    print STDOUT "   -faillog name          Filename to log import failures to.\n";
     84    print STDOUT "                          The default is <collectdir>/colname/etc/fail.log\n\n";
    8485    print STDOUT "  [Type \"perl -S import.pl | more\" if this help text scrolled off your screen]";
    8586    print STDOUT "\n" unless $ENV{'GSDLOS'} =~ /^windows$/i;
     
    9394    $maxdocs, $collection, $configfilename, $collectcfg,
    9495    $pluginfo, $sortmeta, $archive_info_filename,
    95     $archive_info, $processor, $out, $collectdir);
     96    $archive_info, $processor, $out, $faillogname, $collectdir);
    9697
    9798    # note that no defaults are passed for most options as they're set
     
    110111             'maxdocs/^\-?\d+/', \$maxdocs,
    111112             'collectdir/.*/', \$collectdir,
    112              'out/.*/STDERR', \$out)) {
     113             'out/.*/STDERR', \$out,
     114             'faillog/.*/', \$faillogname)) {
    113115    &print_usage();
    114116    die "\n";
     
    131133    die "\n";
    132134    }
     135   
     136    # check and/or set fail log file
     137    if ($faillogname eq "") {
     138    $faillogname="$ENV{GSDLCOLLECTDIR}/etc/fail.log";
     139    } else {
     140    my $can_open=1;
     141    open (TESTOPEN, ">$faillogname") || ($can_open=0);
     142    close (TESTOPEN);
     143    if ($can_open==0) {
     144        warn "fail.log - cannot write to \"$faillogname\", using default\n \"$ENV{GSDLCOLLECTDIR}/etc/fail.log\" instead.\n";
     145        $faillogname="$ENV{GSDLCOLLECTDIR}/etc/fail.log";
     146    }
     147    }
     148    # test that default is writable...
     149    if ($faillogname eq "$ENV{GSDLCOLLECTDIR}/etc/fail.log") {
     150    my $can_open=1;
     151    open (TESTOPEN, ">$faillogname") || ($can_open=0);
     152    close (TESTOPEN);
     153    if ($can_open==0) {
     154        warn "warning - cannot write to \"$faillogname\".\n";
     155        $faillogname="";
     156    }
     157    }
     158
    133159
    134160    # check sortmeta
     
    149175    # get the list of plugins for this collection and set any options that
    150176    # were specified in the collect.cfg (all import.pl options except
    151     # -collectdir and -out may be specified in the collect.cfg (these
     177    # -collectdir, -out and -faillog may be specified in the collect.cfg (these
    152178    # options must be known before we read the collect.cfg))
    153179    my $plugins = [];
     
    232258
    233259    # load all the plugins
    234     $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out);
     260    $pluginfo = &plugin::load_plugins ($plugins, $verbosity, $out, $faillogname);
    235261    if (scalar(@$pluginfo) == 0) {
    236262    print $out "No plugins were loaded.\n";
  • trunk/gsdl/bin/script/pdftohtml.pl

    r2743 r2755  
    4545# note - we don't actually ever use most of these options...
    4646print STDERR 
    47     ("pdftohtml version 0.22 - modified for NZDL use\n",
     47    ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",
    4848     "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
    49      "  -f <int>      : first page to convert\n",
    50      "  -l <int>      : last page to convert\n",
    51      "  -d <dir>      : target directory (default: basename of pdf-file)\n",
    52      "  -o <file>     : name of output file; - means stdout (default index.html)\n",
    53      "  -q            : don't print any messages or errors\n",
    54      "  -h            : print this usage information\n",
    55      "  -p            : exchange .pdf links by .html\n",
    56 # these options now have no effect in gs-custom pdftohtml
    57 #     "  -c            : generate complex HTML document\n",
    58 #     "  -F            : don't use frames in HTML document\n",
    59      "  -i            : ignore images\n",
    60      "  -e <string>   : set extension for images (in the Html-file) (default png)\n"
     49     "Options:\n",
     50     "\t-i\tignore images (don't extract)\n",
     51     "\t-a\tallow images only (continue even if no text is present)\n"
    6152     );
    6253exit (1);
     
    6556sub main {
    6657    my (@ARGV) = @_;
    67     my ($first,$last,$target_dir,$out_file,$img_ext,
    68     $optq,$opth,$optp,$optF,$opti);
     58    my ($allow_no_text,$ignore_images);
    6959   
    7060    # read command-line arguments so that
    7161    # you can change the command in this script
    7262    if (!parsargv::parse(\@ARGV,
    73              'f/\d+/1', \$first,
    74              'l/\d+/1', \$last,
    75              'd/[\S]*/', \$target_dir,
    76              'o/[\S]*/', \$out_file,
    77              'e/[\S]*/', \$img_ext,
    78              'q', \$optq,
    79              'h', \$opth,
    80              'p', \$optp,
    81 #            'c', \$optc,
    82              'F', \$optF,
    83              'i', \$opti
     63             'a', \$allow_no_text,
     64             'i', \$ignore_images
    8465             ))
    8566    {
     
    119100    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);
    120101
     102    $cmd .= " -i" if ($ignore_images);
    121103    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    122     $cmd .= " > \"$output_filestem.out\"";
    123 
    124     # attempting to redirect STDERR on windows 95/98 is a bad idea
    125     $cmd .= " 2> \"$output_filestem.err\""
    126     if $ENV{'GSDLOS'} !~ /^windows$/i;
    127104
    128105# system() returns -1 if it can't run, otherwise it's $cmds ret val.
    129106    # note we return 0 if the file is "encrypted"
     107    $!=0;
    130108    if (system($cmd)!=0) {
    131     print STDERR "Error executing $cmd: $!\n";
     109    print STDERR "pdftohtml error for $input_filename $!\n";
    132110    # leave these for gsConvert.pl...
    133111    #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    134112    #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    135     return 0;
     113    return 1;
    136114    }
    137115
    138116    if (! -e "$output_filestem.html") {
    139     return 0;
     117    return 1;
    140118    }
    141119
    142120# post-process to remove </b><b> and </i><i>, as these break up
    143121# words, screwing up indexing and searching.
     122# At the same time, check that our .html file has some textual content.
    144123    &util::mv("$output_filestem.html","$output_filestem.html.tmp");
     124    $!=0;
    145125    open INFILE, "$output_filestem.html.tmp" ||
    146126    die "Couldn't open file: $!";
     
    148128    die "Couldn't open file for writing: $!";
    149129    my $line;
     130    my $seen_textual_content=$allow_no_text;
    150131    while ($line=<INFILE>) {
    151132    $line =~ s#</b><b>##g;
    152133    $line =~ s#</i><i>##g;
    153134    $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...
     135# check for any extracted text
     136    if ($seen_textual_content == 0) {
     137        my $tmp_line=$line;
     138        $tmp_line =~ s/<[^>]*>//g;
     139        $tmp_line =~ s/Page\s\d+//;
     140        $tmp_line =~ s/\s*//g;
     141        if ($tmp_line ne "") {
     142        $seen_textual_content=1;
     143        }
     144    }
     145
    154146# escape underscores, but not if they're inside tags (eg img/href names)
    155147    my $inatag = 0; # allow multi-line tags
     
    178170    &util::rm("$output_filestem.html.tmp");
    179171
    180 
    181172    # Need to convert images from PPM format to PNG format
    182173    my @images;
     
    192183    }
    193184    close IMAGES;
     185    &util::rm("${directory}image.log") if (-e "${directory}image.log");
     186
     187    # no need to go any further if there is no text extracted from pdf.
     188    if ($seen_textual_content == 0) {
     189    print STDERR "Error: PDF contains no extractable text\n";
     190    # remove images...
     191    for $image (@images) {
     192        chomp($image);
     193        &util::rm("${directory}$image");
     194    }
     195    return 1;
     196    }
     197
     198
    194199
    195200    for $image (@images) {
     
    200205        if (system($cmd)!=0) {
    201206        print STDERR "Error executing $cmd\n";
    202         #return 0; # not sure about whether to leave this one in or take it out
     207        #return 1; # not sure about whether to leave this one in or take it out
    203208        next;
    204209        }
     
    211216        if (system($cmd)!=0) {
    212217            print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
    213             #return 0; # not sure about whether to leave this one in or take it out
     218            #return 1; # not sure about whether to leave this one in or take it out
    214219            next;
    215220        }
     
    219224    }
    220225
    221     return 1;
     226    return 0;
    222227}
    223228
    224 # indicate our error status
    225 if (&main(@ARGV)) {exit 0;}
    226 exit 1;
     229# indicate our error status, 0 = success
     230exit (&main(@ARGV));
     231
  • trunk/gsdl/perllib/plugin.pm

    r1587 r2755  
    3030sub load_plugins {
    3131    my ($plugin_list) = shift @_;
    32     ($verbosity, $outhandle) = @_; # globals
     32    ($verbosity, $outhandle, $faillogname) = @_; # globals
    3333    my @plugin_objects = ();
    3434
     
    5858   
    5959    # initialize plugin
    60     $plugobj->init($verbosity, $outhandle);
     60    $plugobj->init($verbosity, $outhandle, $faillogname);
    6161
    6262    # add this object to the list
  • trunk/gsdl/perllib/plugins/BasPlug.pm

    r2751 r2755  
    175175sub init {
    176176    my $self = shift (@_);
    177     my ($verbosity, $outhandle) = @_;
     177    my ($verbosity, $outhandle, $faillogname) = @_;
    178178
    179179    # verbosity is passed through from the processor
     
    182182    # as is the outhandle ...
    183183    $self->{'outhandle'} = $outhandle if defined $outhandle;
     184    $self->{'faillogname'} = $faillogname;
    184185
    185186    # set process_exp and block_exp to defaults unless they were
  • trunk/gsdl/perllib/plugins/ConvertToPlug.pm

    r2751 r2755  
    168168    print $outhandle "Converting $tailname$suffix to $convert_to format\n";
    169169    }
     170
     171    my $errlog = &util::filename_cat($tmp_dirname, "err.log");
    170172   
    171173    # Execute the conversion command and get the type of the result,
    172174    # making sure the converter gives us the appropriate output type
    173175    my $output_type = lc($convert_to);
    174     my $cmd = "perl -S gsConvert.pl -verbose $verbosity -output $output_type \"$tmp_filename\"";
     176    my $cmd = "perl -S gsConvert.pl -verbose $verbosity -errlog \"$errlog\" -output $output_type \"$tmp_filename\"";
    175177    $output_type = `$cmd`;
    176178
     
    182184    if ($output_type eq "fail") {
    183185    print $outhandle "Could not convert $tailname$suffix to $convert_to format\n";
     186    if ($self->{'faillogname'} ne "" && -s "$errlog") {
     187        open(SAVELOG, ">>$self->{'faillogname'}");
     188        open(ERRLOG, "$errlog");
     189        print SAVELOG "$tailname$suffix (converting to $convert_to) failed:\n";
     190        while (<ERRLOG>) {
     191        print SAVELOG "$_";
     192        }
     193        close ERRLOG;
     194        print SAVELOG "\n";
     195        close SAVELOG;
     196    }
     197    &util::rm("$errlog") if (-e "$errlog");
    184198    return "";
    185199    }
Note: See TracChangeset for help on using the changeset viewer.