Ignore:
Timestamp:
2001-09-26T10:43:44+12:00 (23 years ago)
Author:
jrm21
Message:

import.pl now takes an option for saving file conversion failures to a log.
By default, import.pl will use <collectdir>/etc/fail.log. Currently only
the plugins based on ConvertToPlug will do this. Not yet tested on Win9X.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/gsConvert.pl

    r2656 r2755  
    2828
    2929# gsConvert.pl converts documents in a range of formats to HTML or TEXT
    30 # by exploiting third-party programs.  These are usually found in the
    31 # $GSDLHOME/packages directory.
    32 #
    33 # Currently, we can convert Microsoft Word and Adobe PDF using specialised
    34 # conversion utilities.   We can convery any file to text with a perl
    35 # implementation of the UNIX strings command.
     30# by exploiting third-party programs.  The sources of these are usually found
     31# in the $GSDLHOME/packages directory, and the executables should live in
     32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
     33#
     34# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
     35# using specialised conversion utilities.   We can try to convert any file to
     36# text with a perl implementation of the UNIX strings command.
    3637#
    3738# We try to convert Postscript files to text using "gs" which is often on
    38 # *nix machines. If it isn't (or we're running on Windoze), we do some feeble
    39 # text extraction on it using regexps.
     39# *nix machines. We fall back to performing weak text extraction by using
     40# regular expressions.
    4041
    4142BEGIN {
     
    4950use File::Basename;
    5051
     52# Are we running on WinNT or Win2000 (or later)?
     53my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
     54if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
    5155
    5256sub print_usage
     
    5660    print STDERR "              or text using third-party programs.\n\n";
    5761    print STDERR "  usage: $0 [options] filename\n";
    58     print STDERR "  options:\n\t-type\tdoc|pdf|ps|rtf\n\t-output\thtml|text\n";
    59     print STDERR "\t-timeout\t<max cpu seconds>\n";
     62    print STDERR "  options:\n\t-type\tdoc|pdf|ps|rtf\t(input file type)\n";
     63    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
     64    print STDERR "\t-output\thtml|text\n";
     65    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    6066    exit(1);
    6167}
    6268
     69my $faillogfile="";
    6370
    6471sub main
     
    7178    if (!parsargv::parse(\@ARGV,
    7279             'type/(doc|pdf|ps|rtf)/', \$input_type,
     80             '/errlog/.*/', \$faillogfile,
    7381             'output/(html|text)/', \$output_type,
    7482             'timeout/\d+/0',\$timeout,
     
    198206    }
    199207
    200     return &convertAnything($input_filename, $output_filestem, $output_type);
     208# rtf is so ugly that it's not worth running strings over.
     209# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
     210#    return &convertAnything($input_filename, $output_filestem, $output_type);
     211    return "fail";
    201212}
    202213
     
    232243
    233244sub convertPDF {
    234     ($dirname, $input_filename, $output_filestem, $output_type) = @_;
     245    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
    235246
    236247    my $success = 0;
     
    300311        return "rtf";
    301312        }
     313        $first = 0;
    302314    }
    303315   
     
    308320    }
    309321
    310     $first = 0;
    311 
    312322    }
    313323
     
    320330#
    321331# Each of the following functions attempts to convert a document from
    322 # a specific format to another.  If they succeed yhey return 1 and leave
     332# a specific format to another.  If they succeed they return 1 and leave
    323333# the output document(s) in the appropriate place; if they fail they
    324334# return 0 and delete any working files.
     
    348358    # redirecting STDERR is a bad idea on windows 95/98
    349359    $cmd .= " 2> \"$output_filestem.err\""
    350     if $ENV{'GSDLOS'} !~ /^windows$/i;
     360    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
    351361
    352362    # execute the command
     363    $!=0;
    353364    if (system($cmd)!=0)
    354365    {
    355     print STDERR "Error executing wv converter: $!. Continuing...\n";
     366    print STDERR "Error executing wv converter:$!\n";
     367    if (-s "$output_filestem.err") {
     368        open (ERRFILE, "<$output_filestem.err");
     369
     370        my $write_to_fail_log=0;
     371        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     372        {$write_to_fail_log=1;}
     373
     374        my $line;
     375        while ($line=<ERRFILE>) {
     376        if ($line =~ /\w/) {
     377            print STDERR "$line";
     378            print FAILLOG "$line" if ($write_to_fail_log);
     379        }
     380        if ($line !~ m/startup error/) {next;}
     381        print STDERR " (given an invalid .DOC file?)\n";
     382        print FAILLOG " (given an invalid .DOC file?)\n"
     383        if ($write_to_fail_log);
     384       
     385        } # while ERRFILE
     386        close FAILLOG if ($write_to_fail_log);
     387    }
     388    print STDERR "Continuing...\n";
     389    return 0; # we can try any_to_text
    356390    }
    357391
     
    365399        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
    366400        return 1;
    367     } else {
    368         # An error of some sort occurred
    369         &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    370         &util::rm("$output_filestem.err") if -e "$output_filestem.err";
    371     }
    372     }
    373 
     401    }
     402    }
     403   
     404    # If here, an error of some sort occurred
     405    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
     406    if (-e "$output_filestem.err") {
     407    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
     408        open (ERRLOG,"$output_filestem.err");
     409        while (<ERRLOG>) {print FAILLOG $_;}
     410        close FAILLOG;
     411        close ERRLOG;
     412    }
     413    &util::rm("$output_filestem.err");
     414    }
     415   
    374416    return 0;
    375417}
     
    390432
    391433    $cmd .= " 2>\"$output_filestem.err\""
    392         unless $ENV{'GSDLOS'} =~ /^windows$/i;
     434        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
    393435
    394436
    395437    # execute the command
     438    $!=0;
    396439    if (system($cmd)!=0)
    397440    {
    398     print STDERR "Error executing rtf converter: $!.\n";
     441    print STDERR "Error executing rtf converter $!\n";
    399442    # don't currently bother printing out error log...
    400443    # keep going, in case it still created an HTML file...
    401444    }
    402445
    403     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    404 
    405446    # Was the conversion successful?
     447    my $was_successful=0;
    406448    if (-s "$output_filestem.html") {
    407     return 1;
     449    # make sure we have some content other than header
     450    open (HTML, "$output_filestem.html"); # what to do if fail?
     451    my $line;
     452    my $past_header=0;
     453    while ($line=<HTML>) {
     454
     455        if ($past_header == 0) {
     456        if ($line =~ /<body>/) {$past_header=1;}
     457        next;
     458        }
     459
     460        $line =~ s/<[^>]+>//g;
     461        if ($line =~ /\w/ && $past_header) {  # we found some content...
     462        $was_successful=1;
     463        last;
     464        }
     465    }
     466    close HTML;
     467    }
     468
     469    if ($was_successful) {
     470    &util::rm("$output_filestem.err")
     471        if (-e "$output_filestem.err");
     472    # insert the (modified) table of contents, if it exists.
     473    if (-e "${output_filestem}_ToC.html") {
     474        &util::mv("$output_filestem.html","$output_filestem.src");
     475        my $open_failed=0;
     476        open HTMLSRC, "$output_filestem.src" || ++$open_failed;
     477        open TOC, "${output_filestem}_ToC.html" || ++$open_failed;
     478        open HTML, ">$output_filestem.html" || ++$open_failed;
     479       
     480        if ($open_failed) {
     481        close HTMLSRC;
     482        close TOC;
     483        close HTML;
     484        &util::mv("$output_filestem.src","$output_filestem.html");
     485        return 1;
     486        }
     487
     488        # print out header info from src html.
     489        while (($_ = <HTMLSRC>) =~ /\w/) {
     490        print HTML "$_";
     491        }
     492
     493        # print out table of contents, making links relative
     494        <TOC>; <TOC>; # ignore first 2 lines
     495        print HTML scalar(<TOC>); # line 3 = "<ol>\n"
     496        my $line;
     497        while ($line=<TOC>) {
     498        $line =~ s@</body></html>$@@ ; # only last line has this
     499        # make link relative
     500        $line =~ s@href=\"[^\#]+@href=\"@;
     501        print HTML $line;
     502        }
     503        close TOC;
     504
     505        # rest of html src
     506        while (<HTMLSRC>) {
     507        print HTML $_;
     508        }
     509        close HTMLSRC;
     510        close HTML;
     511
     512        &util::rm("${output_filestem}_ToC.html");
     513        &util::rm("${output_filestem}.src");
     514    }
     515    # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
     516    return 1; # success
     517    }
     518
     519    if (-e "$output_filestem.err") {
     520    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     521    {
     522        print FAILLOG "Error - rtftohtml - couldn't extract text\n";
     523        print FAILLOG " (rtf file might be too recent):\n";
     524        open (ERRLOG, "$output_filestem.err");
     525        while (<ERRLOG>) {print FAILLOG $_;}
     526        close ERRLOG;
     527        close FAILLOG;
     528    }
     529    &util::rm("$output_filestem.err");
    408530    }
    409531
     
    417539
    418540sub pdf_to_html {
    419     ($dirname, $input_filename, $output_filestem) = @_;
     541    my ($dirname, $input_filename, $output_filestem) = @_;
    420542
    421543    $cmd = "";
    422544    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    423     $cmd .= "perl -S pdftohtml.pl -F ";
     545    $cmd .= "perl -S pdftohtml.pl ";
    424546    $cmd .= " \"$input_filename\" \"$output_filestem\"";
     547   
     548    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
     549    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
     550    } else {
     551    $cmd .= " > \"$output_filestem.err\"";
     552    }
     553
    425554    $!=0;
    426555
     
    428557    if ($retval!=0)
    429558    {
    430     print STDERR "Error executing $cmd";
     559    print STDERR "Error executing pdftohtml.pl";
    431560    if ($!) {print STDERR ": $!";}
    432561    print STDERR "\n";
     
    440569    if (-s "$output_filestem.err") {
    441570        open (ERRLOG, "$output_filestem.err") || die "$!";
    442         print STDERR "pdftohtml:\n";
     571        print STDERR "pdftohtml error log:\n";
    443572        while (<ERRLOG>) {
    444573        print STDERR "$_";
     
    447576    }
    448577    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
    449     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
     578    if (-e "$output_filestem.err") {
     579        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     580        {
     581        open (ERRLOG, "$output_filestem.err");
     582        while (<ERRLOG>) {print FAILLOG $_;}
     583        close ERRLOG;
     584        close FAILLOG;
     585        }   
     586    &util::rm("$output_filestem.err");
     587    }
    450588    return 0;
    451589    }
     
    459597
    460598sub pdf_to_text {
    461     ($dirname, $input_filename, $output_filestem) = @_;
     599    my ($dirname, $input_filename, $output_filestem) = @_;
    462600
    463601    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    464     $cmd .= " 2> \"$output_filestem.err\"";
     602
     603    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
     604    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
     605    } else {
     606    $cmd .= " > \"$output_filestem.err\"";
     607    }
    465608   
    466609    if (system($cmd)!=0)
     
    468611    print STDERR "Error executing $cmd: $!\n";
    469612    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    470     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    471     return 0;
     613    }
     614
     615    # make sure there is some extracted text.
     616    if (-e "$output_filestem.text") {
     617    open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
     618    binmode(EXTR_TEXT); # just in case...
     619    my $line="";
     620    my $seen_text=0;
     621    while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
     622        if ($line=~ /\w/) {$seen_text=1;}
     623    }
     624    close EXTR_TEXT;
     625    if ($seen_text==0) { # no text was extracted
     626        print STDERR "Error: pdftotext found no text\n";
     627        &util::rm("$output_filestem.text");
     628    }
    472629    }
    473630
     
    478635    if (-s "$output_filestem.err") {
    479636        open (ERRLOG, "$output_filestem.err") || die "$!";
    480         print STDERR "pdftotext:\n";
     637        print STDERR "pdftotext error log:\n";
    481638        while (<ERRLOG>) {
    482639        print STDERR "$_";
     
    487644    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    488645    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    489     &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    490 
     646    if (-e "$output_filestem.err") {
     647        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
     648        {
     649        open (ERRLOG,"$output_filestem.err");
     650        while (<ERRLOG>) {print FAILLOG $_;}
     651        close ERRLOG;
     652        close FAILLOG;
     653        }
     654        &util::rm("$output_filestem.err");
     655    }
    491656    return 0;
    492657    }
     
    537702    if ($error ne "")
    538703    {
    539     print STDERR "PSPlug: WARNING: Error executing gs: $error\n";
     704    print STDERR "Warning: Error executing gs: $error\n";
    540705    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
     706
     707    if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
     708    {
     709        print FAILLOG "gs - $error\n";
     710        if (-e "$output_filestem.err") {
     711        open(ERRLOG, "$output_filestem.err");
     712        while (<ERRLOG>) {print FAILLOG $_;}
     713        close ERRLOG;
     714        }
     715        close FAILLOG;
     716    }
    541717    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
     718
    542719
    543720    # Fine then. We'll just do a lousy job by ourselves...
     
    545722    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
    546723    #
    547     print STDERR "PSPlug: Stripping text from postscript\n";
     724    print STDERR "Stripping text from postscript\n";
    548725    my $errorcode=0;
    549726    open (IN, "$input_filename")
     
    554731   
    555732    my $text="";  # this is for whole .ps file...
    556     while (<IN>) {
    557         $text.=$_;
    558     }
     733    $text = join('', <IN>); # see man perlport, under "System Resources"
    559734    close IN;
    560735
    561736    # Make sure this is a ps file...
    562737    if ($text !~ /^%!/) {
    563         print STDERR "Bad postscript header: not %!\n";
     738        print STDERR "Bad postscript header: not '%!'\n";
     739        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
     740        {
     741        print FAILLOG "Bad postscript header: not '%!'\n";
     742        close FAILLOG;
     743        }
    564744        return 0;
    565745    }
     
    666846    print HTML "</head><body>\n\n";
    667847
    668     while (<TEXT>) {
    669     print HTML "<p> ", $_;
     848    my $line;
     849    while ($line=<TEXT>) {
     850    $line =~ s/</&lt;/g;
     851    $line =~ s/>/&gt;/g;
     852    if ($line =~ /^\s*$/) {
     853        print HTML "<p>";
     854    } else {
     855        print HTML "<br> ", $line;
     856    }
    670857    }
    671858    print HTML "\n</body></html>\n";
     
    680867# Convert any file to TEXT with a crude perl implementation of the
    681868# UNIX strings command.
     869# Note - this assumes ascii charsets :(     (jrm21)
    682870
    683871sub any_to_text {
    684872    ($input_filename, $output_filestem) = @_;
    685873
    686     open(IN, "<$input_filename");
     874    open(IN, "<$input_filename") || return 0;
    687875    binmode(IN);
    688     open(OUT, ">$output_filestem.text");
     876    open(OUT, ">$output_filestem.text") || return 0;
    689877
    690878    my ($line);
    691     my $dgcount = 0;
     879    my $output_line_count = 0;
    692880    while (<IN>) {
    693881    $line = $_;
     
    710898    if ($line =~ /[^\n ]/) {
    711899        print OUT $line;
     900        ++$output_line_count;
    712901    }
    713902    }
     
    716905    close IN;
    717906
    718     return 1;
    719 }
     907    if ($output_line_count) { # try to protect against binary only formats
     908    return 1;
     909    }
     910
     911    &util::rm("$output_filestem.text");
     912    return 0;
     913
     914}
Note: See TracChangeset for help on using the changeset viewer.