Context Navigation

← Previous Change
Next Change →

pdftohtml.pl

Timestamp:

2001-09-26T10:43:44+12:00 (23 years ago)

Author:

jrm21

Message:

import.pl now takes an option for saving file conversion failures to a log.
By default, import.pl will use <collectdir>/etc/fail.log. Currently only
the plugins based on ConvertToPlug will do this. Not yet tested on Win9X.

File:

1 edited

trunk/gsdl/bin/script/pdftohtml.pl (modified) (9 diffs)

Legend:

Unmodified
Added
Removed

trunk/gsdl/bin/script/pdftohtml.pl

-              r2743
+              r2755
 # note - we don't actually ever use most of these options...
 print STDERR
     ("pdftohtml version 0.22 - modified for NZDL use\n",
+    ("pdftohtml.pl wrapper for pdftohtml version 0.22, modified for GSDL use.\n",
      "Usage: pdftohtml [options] <PDF-file> <html-file>\n",
+     "  -f <int>      : first page to convert\n",
+     "  -l <int>      : last page to convert\n",
+     "  -d <dir>      : target directory (default: basename of pdf-file)\n",
+     "  -o <file>     : name of output file; - means stdout (default index.html)\n",
+     "  -q            : don't print any messages or errors\n",
+     "  -h            : print this usage information\n",
+     "  -p            : exchange .pdf links by .html\n",
+# these options now have no effect in gs-custom pdftohtml
+#     "  -c            : generate complex HTML document\n",
+#     "  -F            : don't use frames in HTML document\n",
+     "  -i            : ignore images\n",
+     "  -e <string>   : set extension for images (in the Html-file) (default png)\n"
+     "Options:\n",
+     "\t-i\tignore images (don't extract)\n",
+     "\t-a\tallow images only (continue even if no text is present)\n"
      );
 exit (1);
 …
 sub main {
     my (@ARGV) = @_;
+    my ($first,$last,$target_dir,$out_file,$img_ext,
+    $optq,$opth,$optp,$optF,$opti);
+    my ($allow_no_text,$ignore_images);
     # read command-line arguments so that
     # you can change the command in this script
     if (!parsargv::parse(\@ARGV,
+             'f/\d+/1', \$first,
+             'l/\d+/1', \$last,
+             'd/[\S]*/', \$target_dir,
+             'o/[\S]*/', \$out_file,
+             'e/[\S]*/', \$img_ext,
+             'q', \$optq,
+             'h', \$opth,
+             'p', \$optp,
+#            'c', \$optc,
+             'F', \$optF,
+             'i', \$opti
+             'a', \$allow_no_text,
+             'i', \$ignore_images
              ))
+    {
 …
     $cmd = "pdftohtml" if ($ENV{'GSDLOS'} =~ /^windows$/);
+    $cmd .= " -i" if ($ignore_images);
     $cmd .= " -noframes \"$input_filename\" \"$output_filestem.html\"";
-    $cmd .= " > \"$output_filestem.out\"";
-    # attempting to redirect STDERR on windows 95/98 is a bad idea
-    $cmd .= " 2> \"$output_filestem.err\""
-    if $ENV{'GSDLOS'} !~ /^windows$/i;
 # system() returns -1 if the command can't be run; otherwise it returns $cmd's wait status (exit code << 8).
     # note we return 0 if the file is "encrypted"
+    $!=0;
     if (system($cmd)!=0) {
     print STDERR "Error executing $cmd: $!\n";
+    print STDERR "pdftohtml error for $input_filename $!\n";
     # leave these for gsConvert.pl...
     #&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
     #&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
     return 0;
+    return 1;
+    }
     if (! -e "$output_filestem.html") {
     return 0;
+    return 1;
+    }
 # post-process to remove </b><b> and </i><i>, as these break up
 # words, screwing up indexing and searching.
+# At the same time, check that our .html file has some textual content.
     &util::mv("$output_filestem.html","$output_filestem.html.tmp");
+    $!=0;
     open INFILE, "$output_filestem.html.tmp" ||
     die "Couldn't open file: $!";
 …
     die "Couldn't open file for writing: $!";
     my $line;
+    my $seen_textual_content=$allow_no_text;
     while ($line=<INFILE>) {
     $line =~ s#</b><b>##g;
     $line =~ s#</i><i>##g;
     $line =~ s#\\#\\\\#g; # until macro language parsing is fixed...
+# check for any extracted text
+    if ($seen_textual_content == 0) {
+        my $tmp_line=$line;
+        $tmp_line =~ s/<[^>]*>//g;
+        $tmp_line =~ s/Page\s\d+//;
+        $tmp_line =~ s/\s*//g;
+        if ($tmp_line ne "") {
+        $seen_textual_content=1;
+        }
+    }
 # escape underscores, but not if they're inside tags (eg img/href names)
     my $inatag = 0; # allow multi-line tags
 …
     &util::rm("$output_filestem.html.tmp");
     # Need to convert images from PPM format to PNG format
     my @images;
 …
+    }
     close IMAGES;
+    &util::rm("${directory}image.log") if (-e "${directory}image.log");
+    # no need to go any further if there is no text extracted from pdf.
+    if ($seen_textual_content == 0) {
+    print STDERR "Error: PDF contains no extractable text\n";
+    # remove images...
+    for $image (@images) {
+        chomp($image);
+        &util::rm("${directory}$image");
+    }
+    return 1;
+    }
     for $image (@images) {
 …
         if (system($cmd)!=0) {
         print STDERR "Error executing $cmd\n";
         #return 0; # not sure about whether to leave this one in or take it out
+        #return 1; # not sure about whether to leave this one in or take it out
         next;
+        }
 …
         if (system($cmd)!=0) {
             print STDERR "Cannot convert $image into PNG format (tried `pnmtopng' and `convert')...\n";
             #return 0; # not sure about whether to leave this one in or take it out
+            #return 1; # not sure about whether to leave this one in or take it out
             next;
+        }
 …
+    }
     return 1;
+    return 0;
+}
 # indicate our error status
+if (&main(@ARGV)) {exit 0;}
+exit 1;
+# indicate our error status, 0 = success
+exit (&main(@ARGV));

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2755 for trunk/gsdl/bin/script/pdftohtml.pl

Legend:

trunk/gsdl/bin/script/pdftohtml.pl

Download in other formats: