root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 30683

Revision 30683, 36.6 KB (checked in by ak19, 3 years ago)

Undoing accidental commits of rev 30681.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time setup: refuse to run without GSDLHOME, then put
# $GSDLHOME/perllib at the front of @INC so that it is searched first
# when the modules below are loaded.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $gsdl_perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $gsdl_perllib);
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded; in that
# case the flag is forced to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches, filled in by parsargv::parse() inside main().
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
69
# Print the command-line help on STDERR and terminate with exit status 1.
# Never returns.
sub print_usage
{
    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR "              or text using third-party programs.\n\n";
    print STDERR "  usage: $0 [options] filename\n";
    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    # -verbose is accepted by the argument parser in main() but was
    # missing from the help text.
    print STDERR "\t-verbose\t<number>\t(verbosity level, default 0)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    # closing parenthesis was missing from this message
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
91
# Settings shared by the conversion routines; main() overwrites them from
# the -errlog, -timeout and -verbose options.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Entry point: parse the command line, work out the input type and hand the
# file to the matching convert* routine.  The routine's result string
# ("html", "text", "item" or "fail") is printed on STDOUT followed by a
# newline.
#
# Arguments: the command-line argument list (normally the global @ARGV).
# Exits with status 1 (via exit or print_usage) on bad arguments, an
# unreadable input file, or an unsupported input type.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The raw argument still carries its leading dash(es) at this point, so
    # allow for them; the bare word is still accepted as before.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # "auto" means "no preference".  The converters only recognise an empty
    # output type as that, so normalise it here; otherwise "-output auto"
    # would skip both their HTML and their text attempts.
    if (defined $output_type && $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Exactly one argument (the input file) must remain after option parsing
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back to the filename extension.  $suffix includes
    # the leading dot and is empty when the file has no extension, in which
    # case the type is left undefined so the explicit error below fires.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
201
# Run the converter on the real command line.
main(@ARGV);
203
204
205
206# Document-type conversion functions
207#
208# The following functions attempt to convert documents from their
209# input type to the specified output type.  If no output type was
210# given, then they first attempt HTML, and then TEXT.
211#
# Each returns the output type ("html", "text", or "item" for paged-image
# output) or "fail" if no conversion is possible.
214
# Convert a file that claims to be a Microsoft Word document.
#
# The real content type is sniffed first, because many .doc files are not
# Word documents at all; the file is then passed to the Word converter, the
# RTF converter, or the generic fallback.  Returns whatever the chosen
# converter returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");

    if ($is_word_type{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# try the Windows scripting converter if -windows_scripting was given and
# the wvware-based converter otherwise.  If that does not produce HTML,
# defer to convertAnything().

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_allowed = !$output_type || ($output_type =~ m/html/i);
    if ($html_allowed) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised HTML conversion is attempted.  Returns "html" when it
# succeeds and "fail" otherwise.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_allowed = !$output_type || ($output_type =~ m/html/i);
    if ($html_allowed) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # Deliberately no convertAnything() fallback: rtf is so ugly that it's
    # not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
280
281
# Convert an unidentified file.
#
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type permits it (an empty output type
# permits both).  Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document.
#
# Conversions are tried in the order images, HTML, text; each one is only
# attempted when the requested output type selects it.  Returns "item" for
# a successful image conversion, "html" or "text" for the others, and
# "fail" when nothing worked.

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last hyphen, if there is one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if (($no_preference || ($output_type =~ m/html/i))
        && &pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if (($no_preference || ($output_type =~ m/text/i))
        && &pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
343
344
# Convert an Adobe PostScript document.
#
# An image conversion is tried when the output type names an image format,
# then a text conversion when text is acceptable.  Returns "item", "text"
# or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last hyphen, if there is one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to TEXT
    if ((!$output_type || ($output_type =~ m/text/i))
        && &ps_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
369
370
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run (returning "item"); otherwise, when HTML is
# acceptable, ppttohtml.pl is run (returning "html").  If the chosen
# converter fails, or none applies, fall back to any_to_text() and return
# "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # image format flag handed to pptextract; empty when none is named
        my $slide_format_flag =
              ($output_type =~ m/gif/i)  ? "-g"
            : ($output_type =~ m/jp?g/i) ? "-j"
            : ($output_type =~ m/png/i)  ? "-p"
            :                              "";

        my $extractor = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                        $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $extractor = "CScript //Nologo \"".$extractor.".vbs\"";
        }

        my $shell_cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $shell_cmd .= "$extractor $slide_format_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $shell_cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($shell_cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $shell_cmd = "\"$perl_exec\" -S ppttohtml.pl ";
        $shell_cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $shell_cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($shell_cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
436
437
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" returned on
# success.  Otherwise, or if that fails, fall back to any_to_text() and
# return "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $shell_cmd = "\"$perl_exec\" -S xlstohtml.pl ";
        $shell_cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $shell_cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($shell_cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
471
472
473
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the filename ends in .docx
#               (any letter case); the file content is not inspected
#   "rtf"     - the first line starts with "{\rtf"
#   "word6" / "word7" / "word8"
#             - some line contains "Word.Document.6", ".7" or ".8"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # case-insensitive, to agree with the docx test in native_doc_to_html()
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $realtype;
}
514
515
516# Specific type-to-type conversions
517#
518# Each of the following functions attempts to convert a document from
519# a specific format to another.  If they succeed they return 1 and leave
520# the output document(s) in the appropriate place; if they fail they
521# return 0 and delete any working files.
522
523
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl through the perl executable reported by
# util::get_perl_exec(), passing it the input file, the output stem, the
# fail-log name, the verbosity and the timeout.
#
# Returns the exit code of wvware.pl; the caller treats any true value as
# success.  NOTE(review): this relies on wvware.pl exiting non-zero when it
# converts the file - confirm against wvware.pl.
# Returns 0 when the command could not be started or was killed by a
# signal.  Dividing the raw status by 256, as before, turned both of those
# cases into a non-zero fraction that callers read as success.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    if ($raw_status == -1) {
        print STDERR "Error: unable to launch wvware.pl: $!\n";
        return 0;
    }
    if ($raw_status & 127) {
        # low 7 bits hold the number of the signal that ended the child
        print STDERR "Error: wvware.pl was terminated by signal ".($raw_status & 127)."\n";
        return 0;
    }

    # high byte is the child's exit code
    return $raw_status >> 8;
}
538
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows a .docx input is handed to docx2html.vbs (run through CScript);
# every other input goes to the word2html executable.  The result is written
# to "$output_filestem.html".
#
# Returns 1 if the HTML file already existed or was produced and its first
# line contains "html"; returns 0 otherwise.  When the command itself fails,
# its error output is echoed to STDERR (and appended to the fail log, if one
# is configured) and the .err file is left in place.  When the command
# succeeds but the output is unusable, the .html and .err files are removed
# after the .err content has been appended to the fail log.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .=  "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # The fail log is optional; remember whether it could be opened
            # (the handle variable is populated even when open() fails).
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # this handle used to be left open
                close($errfile);
            }
            else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }

            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_in, "<", "$output_filestem.html")) {
            $first_line = <$html_in>;
            close($html_in);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
639
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs "rtftohtml -o <stem>.html <input>" and accepts the result only if
# the HTML file contains at least one word character (outside tags) after
# the line holding "<body>".  When a "<stem>_ToC.html" table of contents
# was produced as well, it is spliced into the HTML after the header lines,
# with its links made relative.
#
# Returns 1 on success.  On failure returns 0, appends the converter's
# error output to the fail log (if one is configured) and removes the .err
# and .html files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header;
        # an unreadable file simply counts as "no content"
        if (open(my $html_in, "<", "$output_filestem.html")) {
            my $past_header=0;
            while (my $line = <$html_in>) {

                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # Open the two inputs first and the output last, so that a failed
            # open never leaves a truncated .html behind.  (The old
            # 'open FH, "name" || ++$open_failed' form bound || to the
            # filename string, so failures were never noticed.)
            my ($htmlsrc, $toc, $html_out);
            my $src_ok = open($htmlsrc, "<", "$output_filestem.src");
            my $toc_ok = $src_ok && open($toc, "<", "${output_filestem}_ToC.html");
            my $out_ok = $toc_ok && open($html_out, ">", "$output_filestem.html");

            if (!$out_ok) {
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                # put the converter's own output back and use it as it is
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to, but
            # not including, the first line without a word character
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_open if (defined $list_open);
            while (my $toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $body_line = <$htmlsrc>) {
                print {$html_out} $body_line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
756
757
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# Returns 1 when the command exits with status 0 and leaves a non-empty
# "$output_filestem.html"; the .err and .out work files are then removed.
# Otherwise returns 0 after echoing the converter's error output to STDERR,
# appending it to the fail log (if one is configured) and removing the
# .out, .html and .err files.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $perl_exec = &util::get_perl_exec();
    $cmd .= "\"$perl_exec\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr go to separate files where the platform allows it
    $cmd .= ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000)
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # success: the converter exited cleanly and made something
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure: report and clean up
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (my $errlog, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close $errlog;
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open (my $errlog, $err_file);
            while (my $err_line = <$errlog>) {
                print {$faillog} $err_line;
            }
            close $errlog;
            close $faillog;
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
821
# Convert a pdf file to various types of image with the convert command
#
# First probes for ImageMagick by running "gs-magick.pl identify" (skipped
# when GSDLOS is "windows" and Win32::IsWinNT() is false), then runs
# pdfpstoimg.pl with the image format taken from the part of $output_type
# after its last underscore.
#
# Returns 1 when pdfpstoimg.pl exits with status 0, and 0 when ImageMagick
# appears to be missing or the conversion command fails (its error output
# is then echoed to STDERR and appended to the fail log, if one is
# configured).

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $err_file = "$output_filestem.err";
    my $out_file = "$output_filestem.out";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if ($ENV{'GSDLOS'} ne "windows" || Win32::IsWinNT()) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        my $probe_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        if ($status & 0x80) {
            $status = -(0x100 - ($status & 0xFF));
        }

        # the "program not found" code differs per platform (see above)
        my $not_found_status = ($ENV{'GSDLOS'} eq "windows") ? 1 : -1;
        if ($status == $not_found_status) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";

    # keep only what follows the last underscore, e.g. the "jpg" of "pagedimg_jpg"
    $output_type =~ s/.*\_(.*)/$1/i;

    my $perl_exec = &util::get_perl_exec();
    $cmd .= "\"$perl_exec\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    $cmd .= ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000)
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);

    # success: just tidy up the work files
    if ($retval == 0) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    print STDERR "Error executing pdfpstoimg.pl";
    print STDERR ": $!" if ($!);
    print STDERR "\n";

    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (my $errlog, $err_file) || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close $errlog;
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open (my $errlog, $err_file);
            while (my $err_line = <$errlog>) {
                print {$faillog} $err_line;
            }
            close $errlog;
            close $faillog;
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
905
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file to convert
#   $output_filestem - output path without extension; the extracted text is
#                      written to "$output_filestem.text"
#
# Returns 1 if a non-empty text file containing at least one word character
# was produced, 0 otherwise.  On failure the converter's error output is
# echoed to STDERR and, if the file-level $faillogfile is non-empty, appended
# to that file.  The temporary ".out" and ".err" files are removed on both
# the success and the failure path.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # only stdout is redirected on windows
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            # explicit defined() so that a final line of "0" without a
            # newline does not end the scan prematurely
            while (defined(my $line = <$extracted>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) || die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR "$err_line";
            }
            close $errlog;
        }
        # the stdout redirect above creates a .out file on non-windows systems
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                # copy the converter's error output into the fail log
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $err_line = <$errlog>)) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # success: tidy up both temporary files (the .out file used to be
    # left behind here)
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
971
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# On non-windows systems gs is tried first (CPU-limited with "ulimit -t"
# when the file-level $timeout is set).  If gs cannot be used or reports an
# error, the problem is logged (also to $faillogfile when that is non-empty)
# and a crude regexp-based text stripper is used instead.  Whichever route
# produced the text, it is finally re-wrapped at the first whitespace after
# 72 characters.
#
# Returns 1 on success.  Returns 0 if the fallback stripper cannot read the
# input, cannot write the output, or the input does not start with "%!".
# Dies if the files needed for the final wrapping step cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted so that a filestem containing spaces still works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Test the whole wait status, not just the exit code in the high
        # byte: a gs process killed by a signal (eg when the ulimit CPU
        # limit is hit) leaves the exit code at 0.
        # If the program couldn't be started, look at $! for the message.
        if ($? != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                # an empty output file gives undef here
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $err_line = <$errlog>)) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        # (checked before the output file is created, so that a rejected
        # input does not leave an empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" rather than "||": with "||" the die was attached to the (always
    # true) filename string and could never fire
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$infile>)) {
        while (length($line) > 0) {
            # only use $1 when the substitution really matched; otherwise
            # emit the remainder as it is, so the loop always terminates
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1141
1142
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text() and then wrapped in a
# minimal HTML page: blank lines become "<p>", every other line is prefixed
# with "<br> ", and "&", "<" and ">" are escaped.  The intermediate
# "$output_filestem.text" file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text() fails or if either the
# intermediate text file or the HTML file cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        # previously an unwritable HTML file was still reported as success
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # "&" has to be escaped first, otherwise the entities generated
        # for "<" and ">" would be escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1179
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to extract strings from
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Runs of characters outside the printable ASCII range (octal 040-176) are
# treated as line breaks, and strings shorter than 10 characters are
# discarded.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 if the
# file-level $use_strings flag is off, if either file cannot be opened, or
# if nothing printable was found (in which case the output file is removed).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    print STDERR "\n**** In any to text****\n\n";
    # three-argument open: the filename is never parsed for a mode, and
    # leading/trailing whitespace in it is preserved
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        close $in; # don't leave the input handle open on failure
        return 0;
    }

    my $output_line_count = 0;
    # a named lexical is used so the caller's $_ is left untouched
    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;

}
Note: See TracBrowser for help on using the browser.