source: trunk/gsdl/bin/script/gsConvert.pl@ 4103

Last change on this file since 4103 was 4103, checked in by sjboddie, 21 years ago

Added a -nohidden PDFPlug option and made it pass the -hidden option to pdftohtml
by default.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 27.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert PostScript files to text using "gs", which is often
43# installed on *nix machines. If that fails we fall back to weak text
44# extraction using regular expressions.
45
# Make the Greenstone perl libraries (parsargv, util) loadable: GSDLHOME
# must be set, and its perllib directory is put at the front of @INC.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded (i.e. on
# non-Windows platforms), in which case the flag is forced to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line option values, filled in by main() and read by the
# individual converters.
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom, $pdf_ignore_images);
65
# Print a summary of the command-line options to STDERR and terminate the
# script with exit status 1. This function never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this line
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
85
# $faillogfile: file that converter error output is appended to ("" = none).
# $timeout:     when non-zero, prefixed to converter commands as "ulimit -t".
my ($faillogfile, $timeout) = ("", 0);
88
# Entry point: parse the command line, work out the input type (from
# -type or, failing that, from the filename extension), change into the
# input file's directory and dispatch to the matching convertXXX function.
#
# The conversion result ("html", "text" or "fail") is printed on STDOUT
# followed by a newline. Usage errors go through print_usage(); an
# unreadable input file or an unknown type exits with status 1.
sub main
{
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_nohidden', \$pdf_nohidden,
                         # documented in print_usage and read by pdf_to_html,
                         # but previously never parsed
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the lower-cased extension without
    # its leading dot (empty when the file has no extension).
    if (!defined($input_type) || $input_type eq "")
    {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        # lc() never returns undef, so test for the empty string as well;
        # otherwise this diagnostic could never be reached.
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to the caller on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}
173
# Run the conversion with the script's command-line arguments.
main(@ARGV);
175
176
177
178# Document-type conversion functions
179#
180# The following functions attempt to convert documents from their
181# input type to the specified output type. If no output type was
182# given, then they first attempt HTML, and then TEXT.
183#
184# Each returns the output type ("html" or "text") or "fail" if no
185# conversion is possible.
186
# Convert a Microsoft word document
#
# The real type of the file is sniffed first because many .doc files are
# not Word documents at all. Returns "html", "text" or "fail".
sub convertDOC {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype =~ /^word[678]$/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
203
# Convert a Microsoft word 6/7/8 document
#
# Tries the specialised doc_to_html converter when HTML output is
# acceptable, and otherwise (or on failure) falls back to convertAnything.
# Returns "html", "text" or "fail".
sub convertWord678 {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&doc_to_html($input_filename, $output_filestem));
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
221
222
# Convert a Rich Text Format (RTF) file
#
# Only HTML output (via rtf_to_html) is attempted. Returns "html" or "fail".
sub convertRTF {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&rtf_to_html($input_filename, $output_filestem));
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
243
244
# Convert an unidentified file
#
# Uses the strings-style extractors: any_to_html when HTML output is
# acceptable, then any_to_text when text output is acceptable.
# Returns "html", "text" or "fail".
sub convertAnything {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
269
270
271
# Convert an Adobe PDF document
#
# HTML (pdf_to_html) is tried first and plain text (pdf_to_text) second;
# each is only attempted when no output type was requested or the
# requested type matches. Returns "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $html_allowed = (!$output_type || $output_type =~ /html/i);
    my $text_allowed = (!$output_type || $output_type =~ /text/i);

    # Attempt conversion to HTML
    return "html"
        if ($html_allowed
            && &pdf_to_html($dirname, $input_filename, $output_filestem));

    # Attempt conversion to TEXT
    return "text"
        if ($text_allowed
            && &pdf_to_text($dirname, $input_filename, $output_filestem));

    return "fail";
}
298
299
# Convert an Adobe PostScript document
#
# Only text output (via ps_to_text) is attempted, so a request for HTML
# output yields "fail". Returns "text" or "fail".
sub convertPS {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
318
319
# Convert a Microsoft PowerPoint document
#
# When HTML output is acceptable, runs ppttohtml.pl; if that is skipped or
# fails, falls back to any_to_text. Returns "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is now lexical; it used to leak as a
        # package global)
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected where the shell supports it
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
352
353
# Convert a Microsoft Excel document
#
# When HTML output is acceptable, runs xlstohtml.pl; if that is skipped or
# fails, falls back to any_to_text. Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is now lexical; it used to leak as a
        # package global)
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected where the shell supports it
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
386
387
388
389
390
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns "rtf" when the first line starts with "{\rtf", "word6", "word7"
# or "word8" when a "Word.Document.N" marker is found on any line, and
# "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
427
428
429
430# Specific type-to-type conversions
431#
432# Each of the following functions attempts to convert a document from
433# a specific format to another. If they succeed they return 1 and leave
434# the output document(s) in the appropriate place; if they fail they
435# return 0 and delete any working files.
436
437
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare on $input_filename, writing "$output_filestem.html" (and
# "$output_filestem.err" where stderr can be redirected). Returns 1 when
# the generated file's first line contains "DOCTYPE HTML", 0 otherwise.
# When wvWare itself exits non-zero its error output is echoed to STDERR
# and the fail log, and the working files are left in place.
sub doc_to_html {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s $err_file) {
            my ($errfile, $faillog);
            my $have_errfile = open($errfile, '<', $err_file);
            my $write_to_fail_log =
                ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) ? 1 : 0;

            if ($have_errfile) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # this handle used to be left open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        # append the converter's error output to the fail log, if any
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
518
519
# Attempt to convert an RTF document to html with rtftohtml
#
# Writes "$output_filestem.html". The conversion counts as successful only
# if the file has some word characters after its <body> line. If rtftohtml
# also produced "${output_filestem}_ToC.html", that table of contents is
# spliced in after the header of the main file with its links made
# relative. Returns 1 on success; on failure appends the converter's error
# output to the fail log, removes the working files and returns 0.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";
    my $src_file  = "$output_filestem.src";

    # formulate the command ($cmd is now lexical; it used to leak as a
    # package global)
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$html_file\" \"$input_filename\"";
    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ /<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # The original wrote `open X, "file" || ++$open_failed`, where
            # || binds to the string, so failures were never noticed.
            my ($htmlsrc, $toc, $html_out);
            my $src_ok  = open($htmlsrc,  '<', $src_file);
            my $toc_ok  = open($toc,      '<', $toc_file);
            my $html_ok = open($html_out, '>', $html_file);

            if (!($src_ok && $toc_ok && $html_ok)) {
                # give up on the ToC and put the plain conversion back
                close($htmlsrc)  if ($src_ok);
                close($toc)      if ($toc_ok);
                close($html_out) if ($html_ok);
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character.
            my $header_line;
            while (defined($header_line = <$htmlsrc>) && $header_line =~ /\w/) {
                print {$html_out} "$header_line";
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;       # ignore first 2 lines
            $skipped = <$toc>;
            print {$html_out} scalar(<$toc>); # line 3 = "<ol>\n"
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
635
636
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the -zoom/-c/-i/-hidden options selected on the
# command line. Returns 1 when the command exits with status 0 and
# "$output_filestem.html" is non-empty. Otherwise echoes the converter's
# error output to STDERR and the fail log, removes the working files and
# returns 0. $dirname is accepted but not used here.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # $cmd is now lexical; it used to leak as a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # stderr cannot be redirected here, so capture stdout instead
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file)
    {
        &util::rm($out_file) if (-e $out_file);
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) || die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        &util::rm($html_file) if (-e $html_file);
        if (-e $err_file) {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
697
# Convert a PDF file to text with the pdftotext command
#
# Writes "$output_filestem.text". Returns 1 when that file exists and
# contains at least one word character. Otherwise echoes the converter's
# error output to STDERR and the fail log, removes the working files and
# returns 0. $dirname is accepted but not used here.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            # explicit defined() so a final line of "0" does not end the scan
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ /\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file)
    {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) || die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        # does this converter create a .out file?
        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }
    &util::rm($err_file) if (-e $err_file);
    return 1;
}
763
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Writes "$output_filestem.text". On non-Windows systems gs is tried
# first; if it cannot be run, fails, or reports an interpreter error (or
# on Windows, where gs is not attempted) the text is scraped out of the
# PostScript source with regular expressions instead. The result is then
# re-wrapped at the first whitespace after 72 characters. Returns 1 on
# success and 0 when the fallback cannot read/write its files or the input
# does not start with "%!".
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        # quoted like every other redirect target, so paths with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            if (defined($first_line) && $first_line =~ /^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close($psout);
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        my $faillog;
        if ("$faillogfile" ne "" && open($faillog, '>>', $faillogfile))
        {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        my ($in, $out);
        open($in, '<', $input_filename)
            || ($errorcode = 1, warn "Couldn't read file: $!");
        open($out, '>', $text_file)
            || ($errorcode = 1, warn "Couldn't write file: $!");
        if ($errorcode) {print STDERR "errors\n"; return 0;}

        my $text = ""; # this is for whole .ps file...
        $text = join('', <$in>); # see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file...
        if ($text !~ /^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            my $badlog;
            if ($faillogfile ne "" && open($badlog, '>>', $faillogfile))
            {
                print {$badlog} "Bad postscript header: not '%!'\n";
                close($badlog);
            }
            # don't leave an open handle and an empty output file behind
            close($out);
            &util::rm($text_file) if (-e $text_file);
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print {$out} "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv($text_file, "$text_file.tmp");
    # `or` rather than `||`: the original bound || to the filename string,
    # so these die() calls could never fire.
    open(my $infile, '<', "$text_file.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    &util::rm("$text_file.tmp");

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
932
933
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# First produces "$output_filestem.text" via any_to_text, then wraps each
# line of it in minimal HTML in "$output_filestem.html" (blank lines
# become <p>, other lines are prefixed with <br>), and removes the
# intermediate text file. Returns 1 on success, 0 if no text could be
# extracted or the files could not be opened.
sub any_to_html {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file; previously a failed open was
    # ignored and success was reported with no HTML written
    my ($text, $html);
    if (!open($text, '<', $text_file)) {
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html, '>', $html_file)) {
        close($text);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html} "<html><head>\n";
    print {$html} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html} "</head><body>\n\n";

    while (my $line = <$text>) {
        # escape markup characters; & must go first so the entities
        # produced for < and > are not escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print {$html} "<p>";
        } else {
            print {$html} "<br> ", $line;
        }
    }
    print {$html} "\n</body></html>\n";

    close($html);
    close($text);

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
970
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Does nothing and returns 0 unless the -use_strings option was given.
# Otherwise writes runs of at least 10 printable ASCII characters from
# $input_filename to "$output_filestem.text". Returns 1 if anything was
# written; otherwise removes the output file and returns 0.
sub any_to_text {
    # lexical copies: the arguments used to be assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    return 0 if (!$use_strings);

    my $text_file = "$output_filestem.text";

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        close($in);   # the input handle used to leak on this path
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {
        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}
Note: See TracBrowser for help on using the repository browser.