Ignore:
Timestamp:
2001-09-26T10:43:44+12:00 (23 years ago)
Author:
jrm21
Message:

import.pl now takes an option for saving file conversion failures to a log.
By default, import.pl writes the log to <collectdir>/etc/fail.log. Currently,
only plugins based on ConvertToPlug record their failures there. This has not
yet been tested on Win9X.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/pdftohtml.pl

    r2743 r2755  
    4545# note - we don't actually ever use most of these options...
    4646print STDERR 
    47     ("pdftohtml version 0.22 - modified for NZDL use\n",
     47    ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",
    4848     "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
    49      "  -f <int>      : first page to convert\n",
    50      "  -l <int>      : last page to convert\n",
    51      "  -d <dir>      : target directory (default: basename of pdf-file)\n",
    52      "  -o <file>     : name of output file; - means stdout (default index.html)\n",
    53      "  -q            : don't print any messages or errors\n",
    54      "  -h            : print this usage information\n",
    55      "  -p            : exchange .pdf links by .html\n",
    56 # these options now have no effect in gs-custom pdftohtml
    57 #     "  -c            : generate complex HTML document\n",
    58 #     "  -F            : don't use frames in HTML document\n",
    59      "  -i            : ignore images\n",
    60      "  -e <string>   : set extension for images (in the Html-file) (default png)\n"
     49     "Options:\n",
     50     "\t-i\tignore images (don't extract)\n",
     51     "\t-a\tallow images only (continue even if no text is present)\n"
    6152     );
    6253exit (1);
     
    6556sub main {
    6657    my (@ARGV) = @_;
    67     my ($first,$last,$target_dir,$out_file,$img_ext,
    68     $optq,$opth,$optp,$optF,$opti);
     58    my ($allow_no_text,$ignore_images);
    6959   
    7060    # read command-line arguments so that
    7161    # you can change the command in this script
    7262    if (!parsargv::parse(\@ARGV,
    73              'f/\d+/1', \$first,
    74              'l/\d+/1', \$last,
    75              'd/[\S]*/', \$target_dir,
    76              'o/[\S]*/', \$out_file,
    77              'e/[\S]*/', \$img_ext,
    78              'q', \$optq,
    79              'h', \$opth,
    80              'p', \$optp,
    81 #            'c', \$optc,
    82              'F', \$optF,
    83              'i', \$opti
     63             'a', \$allow_no_text,
     64             'i', \$ignore_images
    8465             ))
    8566    {
     
    119100    $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);
    120101
     102    $cmd .= " -i" if ($ignore_images);
    121103    $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
    122     $cmd .= " > \"$output_filestem.out\"";
    123 
    124     # attempting to redirect STDERR on windows 95/98 is a bad idea
    125     $cmd .= " 2> \"$output_filestem.err\""
    126     if $ENV{'GSDLOS'} !~ /^windows$/i;
    127104
    128105# system() returns -1 if it can't run, otherwise it's $cmds ret val.
    129106    # note we return 0 if the file is "encrypted"
     107    $!=0;
    130108    if (system($cmd)!=0) {
    131     print STDERR "Error executing $cmd: $!\n";
     109    print STDERR "pdftohtml error for $input_filename $!\n";
    132110    # leave these for gsConvert.pl...
    133111    #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    134112    #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    135     return 0;
     113    return 1;
    136114    }
    137115
    138116    if (! -e "$output_filestem.html") {
    139     return 0;
     117    return 1;
    140118    }
    141119
    142120# post-process to remove </b><b> and </i><i>, as these break up
    143121# words, screwing up indexing and searching.
     122# At the same time, check that our .html file has some textual content.
    144123    &util::mv("$output_filestem.html","$output_filestem.html.tmp");
     124    $!=0;
    145125    open INFILE, "$output_filestem.html.tmp" ||
    146126    die "Couldn't open file: $!";
     
    148128    die "Couldn't open file for writing: $!";
    149129    my $line;
     130    my $seen_textual_content=$allow_no_text;
    150131    while ($line=<INFILE>) {
    151132    $line =~ s#</b><b>##g;
    152133    $line =~ s#</i><i>##g;
    153134    $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...
     135# check for any extracted text
     136    if ($seen_textual_content == 0) {
     137        my $tmp_line=$line;
     138        $tmp_line =~ s/<[^>]*>//g;
     139        $tmp_line =~ s/Page\s\d+//;
     140        $tmp_line =~ s/\s*//g;
     141        if ($tmp_line ne "") {
     142        $seen_textual_content=1;
     143        }
     144    }
     145
    154146# escape underscores, but not if they're inside tags (eg img/href names)
    155147    my $inatag = 0; # allow multi-line tags
     
    178170    &util::rm("$output_filestem.html.tmp");
    179171
    180 
    181172    # Need to convert images from PPM format to PNG format
    182173    my @images;
     
    192183    }
    193184    close IMAGES;
     185    &util::rm("${directory}image.log") if (-e "${directory}image.log");
     186
     187    # no need to go any further if there is no text extracted from pdf.
     188    if ($seen_textual_content == 0) {
     189    print STDERR "Error: PDF contains no extractable text\n";
     190    # remove images...
     191    for $image (@images) {
     192        chomp($image);
     193        &util::rm("${directory}$image");
     194    }
     195    return 1;
     196    }
     197
     198
    194199
    195200    for $image (@images) {
     
    200205        if (system($cmd)!=0) {
    201206        print STDERR "Error executing $cmd\n";
    202         #return 0; # not sure about whether to leave this one in or take it out
     207        #return 1; # not sure about whether to leave this one in or take it out
    203208        next;
    204209        }
     
    211216        if (system($cmd)!=0) {
    212217            print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
    213             #return 0; # not sure about whether to leave this one in or take it out
     218            #return 1; # not sure about whether to leave this one in or take it out
    214219            next;
    215220        }
     
    219224    }
    220225
    221     return 1;
     226    return 0;
    222227}
    223228
    224 # indicate our error status
    225 if (&main(@ARGV)) {exit 0;}
    226 exit 1;
     229# indicate our error status, 0 = success
     230exit (&main(@ARGV));
     231
Note: See TracChangeset for help on using the changeset viewer.