source: trunk/gsdl/bin/script/gsConvert.pl@ 3246

Last change on this file since 3246 was 3246, checked in by jrm21, 22 years ago

RTF files that end in .doc were converted to $filestem.doc.html by default,
so we force the output filename instead.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time setup: refuse to run without GSDLHOME, then put
# $GSDLHOME/perllib at the front of the module search path so the "use"
# statements that follow can be resolved from there.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded, the require dies inside the eval,
# the eval yields undef, and the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;
59
# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  This sub never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }

    exit(1);
}
72
# Settings shared between main() and the conversion routines further down
# this file (they are file-scoped lexicals, visible to every sub below).
my $faillogfile = "";   # -errlog: converter error output is appended to this file
my $timeout = 0;        # -timeout: CPU seconds, read by the routines that
                        # prefix their shell command with "ulimit -t"

# Script entry point.
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory holding the
# input file and dispatches to the matching convertXXX routine.  The
# routine's result ("html", "text" or "fail") is printed on STDOUT followed
# by a newline.
#
# Parameters: the command-line argument list.
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable input file, or an unsupported/undeterminable input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);
    my $cpu_limit = 0;

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$cpu_limit,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Publish the limit to the conversion routines.  It used to be parsed
    # into a variable private to main, so the option silently did nothing.
    # "ulimit" is a unix shell facility (see the usage text), so the limit
    # is not applied on windows.
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if ($cpu_limit && !$on_windows) {
        $timeout = $cpu_limit;
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below; a relative path would no
    # longer point at the file after that, so make it absolute first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the (lower-cased) filename extension.
    # $suffix is "" when the filename has no extension.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : "";
    }
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my %converter_for = (
        "doc" => sub { &convertDOC($input_filename, $output_filestem, $output_type) },
        "rtf" => sub { &convertRTF($input_filename, $output_filestem, $output_type) },
        "pdf" => sub { &convertPDF($dirname, $input_filename, $output_filestem, $output_type) },
        "ps"  => sub { &convertPS($input_filename, $output_filestem, $output_type) },
        "ppt" => sub { &convertPPT($input_filename, $output_filestem, $output_type) },
        "xls" => sub { &convertXLS($input_filename, $output_filestem, $output_type) },
    );

    my $converter = $converter_for{$input_type};
    if (!defined $converter) {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    my $result = $converter->();
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
157
158
159
160# Document-type conversion functions
161#
162# The following functions attempt to convert documents from their
163# input type to the specified output type. If no output type was
164# given, then they first attempt HTML, and then TEXT.
165#
166# Each returns the output type ("html" or "text") or "fail" if no
167# conversion is possible.
168
# Convert a Microsoft word document
#
# Sniffs the real content type with find_docfile_type and hands the file to
# the matching converter: Word 6/7/8 files go to convertWord678, RTF files
# to convertRTF, and everything else to convertAnything.
#
# Parameters: input filename, output file stem, requested output type
#             (may be empty/undef for "no preference").
# Returns whatever the chosen converter returns ("html", "text" or "fail").

sub convertDOC {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype =~ /\Aword[678]\z/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
185
# Convert a Microsoft word 6/7/8 document
#
# Tries doc_to_html first when HTML is acceptable (no output type given, or
# an output type matching /html/i); if that is skipped or fails, falls back
# to convertAnything.
#
# Returns "html", or whatever convertAnything returns.

sub convertWord678 {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if &doc_to_html($input_filename, $output_filestem);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
203
204
# Convert a Rich Text Format (RTF) file
#
# Only an HTML conversion (rtf_to_html) is attempted, and only when HTML is
# acceptable (no output type given, or an output type matching /html/i).
#
# Returns "html" on success, otherwise "fail".

sub convertRTF {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if &rtf_to_html($input_filename, $output_filestem);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
225
226
# Convert an unidentified file
#
# Tries any_to_html when HTML is acceptable, then any_to_text when text is
# acceptable.  With no output type given both are acceptable, HTML first.
#
# Returns "html", "text" or "fail".

sub convertAnything {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
251
252
253
# Convert an Adobe PDF document
#
# HTML (pdf_to_html) is tried first and TEXT (pdf_to_text) second, each only
# if the requested output type allows it; an empty output type allows both.
#
# Returns "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $html_allowed = (!$output_type || ($output_type =~ /html/i));
    my $text_allowed = (!$output_type || ($output_type =~ /text/i));

    # Attempt conversion to HTML
    if ($html_allowed
        && &pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if ($text_allowed
        && &pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
280
281
# Convert an Adobe PostScript document
#
# Only a TEXT conversion (ps_to_text) exists, attempted when no output type
# was given or the output type matches /text/i.
#
# Returns "text" on success, otherwise "fail".

sub convertPS {
    # lexical copies: these used to be assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if &ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
300
301
# Run one of the "<something>tohtml.pl" helper scripts (found via perl -S)
# on $input_filename, asking it to write "$output_filestem.html".
# Returns the raw value of system(): 0 means the script reported success.
# $! is cleared before the call so the caller can include it in a message.
sub _run_office_converter {
    my ($script, $input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "perl -S $script ";
    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    # stderr is only redirected where the check below allows it
    # (anything that is not windows, or windows NT/2000)
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    return system($cmd);
}


# Convert a PowerPoint document.
#
# When HTML is acceptable (no output type, or one matching /html/i) the
# ppttohtml.pl script is tried first.  If that is skipped or fails, the file
# is run through any_to_text regardless of the requested output type.
#
# Returns "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (_run_office_converter("ppttohtml.pl", $input_filename,
                                  $output_filestem) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}


# Convert an Excel document.
#
# Same strategy as convertPPT, using the xlstohtml.pl script.
#
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (_run_office_converter("xlstohtml.pl", $input_filename,
                                  $output_filestem) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
368
369
370
371
372
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# The file is read line by line in binary mode:
#   - if the very first line starts with "{\rtf" the result is "rtf";
#   - the first line containing "Word.Document.N" with N in 6..8 gives
#     "word6", "word7" or "word8";
#   - otherwise (including when the file cannot be opened) the result is
#     "unknown".

sub find_docfile_type {
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $doctype = "unknown";
    my $on_first_line = 1;

    while (my $line = <$chk>) {

        if ($on_first_line) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $on_first_line = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $doctype;
}
409
410
411
412# Specific type-to-type conversions
413#
414# Each of the following functions attempts to convert a document from
415# a specific format to another. If they succeed they return 1 and leave
416# the output document(s) in the appropriate place; if they fail they
417# return 0 and delete any working files.
418
419
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare on $input_filename with its output redirected to
# "$output_filestem.html" (and, where stderr redirection is enabled, its
# errors to "$output_filestem.err").
#
# Returns 1 if the command succeeded and the first line of the HTML file
# contains "DOCTYPE HTML".  Otherwise returns 0 after reporting the
# converter's error output (to STDERR and/or the -errlog file) and removing
# the .html and .err working files.
#
# NOTE: $timeout and $faillogfile are not parameters; they are whatever
# variables of those names are in scope at file level.

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";

        my $errfh;
        if (-s $err_file && open($errfh, '<', $err_file)) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfh>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print $faillog "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print $faillog " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }

            close($faillog) if ($write_to_fail_log);
            close($errfh);
        }
        print STDERR "Continuing...\n";

        # don't leave partial output or the error log lying around
        &util::rm($html_file) if -e $html_file;
        &util::rm($err_file)  if -e $err_file;
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $htmlfh, '<', $html_file)) {
            $first_line = <$htmlfh>;
            close($htmlfh);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errfh, '<', $err_file)) {
                while (my $line = <$errfh>) {print $faillog $line;}
                close($errfh);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
501
502
# Splice rtftohtml's separate table-of-contents file into the main HTML
# file.  The HTML file is moved aside to $src_file and rebuilt as:
# header lines of the source (up to the first line with no word character),
# then the table of contents with its links made relative, then the rest of
# the source.  If any of the three files cannot be opened the original HTML
# file is put back untouched.
sub _rtf_insert_toc {
    my ($html_file, $toc_file, $src_file) = @_;

    &util::mv($html_file, $src_file);

    my ($src, $toc, $out);
    my $src_ok = open($src, '<', $src_file);
    my $toc_ok = $src_ok && open($toc, '<', $toc_file);
    my $out_ok = $toc_ok && open($out, '>', $html_file);

    if (!$out_ok) {
        close($toc) if $toc_ok;
        close($src) if $src_ok;
        &util::mv($src_file, $html_file);
        return;
    }

    # print out header info from src html.
    # (the line that ends the header is read but not copied)
    while (defined(my $header_line = <$src>)) {
        last unless ($header_line =~ /\w/);
        print $out $header_line;
    }

    # print out table of contents, making links relative
    <$toc>; <$toc>;                 # ignore first 2 lines
    my $list_start = <$toc>;        # line 3 = "<ol>\n"
    print $out $list_start if defined $list_start;
    while (my $line = <$toc>) {
        $line =~ s@</body></html>$@@ ; # only last line has this
        # make link relative
        $line =~ s@href=\"[^\#]+@href=\"@;
        print $out $line;
    }
    close($toc);

    # rest of html src
    while (my $line = <$src>) {
        print $out $line;
    }
    close($src);
    close($out);

    &util::rm($toc_file);
    &util::rm($src_file);
    return;
}


# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml writing to "$output_filestem.html".  The conversion counts
# as successful when the HTML file has, somewhere after the line containing
# "<body>", a line that still contains a word character once its tags are
# stripped.  On success the table of contents file
# ("${output_filestem}_ToC.html"), if present, is merged into the HTML.
#
# Returns 1 on success.  On failure returns 0 after appending the
# converter's error output to the -errlog file (if one was given) and
# removing the .err and .html working files.
#
# NOTE: $timeout and $faillogfile are not parameters; they are whatever
# variables of those names are in scope at file level.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    # make sure we have some content other than header
    my $was_successful = 0;
    my $html;
    if (-s $html_file && open($html, '<', $html_file)) {
        my $past_header = 0;
        while (my $line = <$html>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ /<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html);
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        my $toc_file = "${output_filestem}_ToC.html";
        if (-e $toc_file) {
            _rtf_insert_toc($html_file, $toc_file, "$output_filestem.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
618
619
# Convert a pdf file to html with the pdftohtml command
#
# Runs "perl -S pdftohtml.pl <input> <output stem>".  The conversion counts
# as successful when the command exits with 0 and "$output_filestem.html"
# exists with non-zero size.
#
# Returns 1 on success (the .err and .out working files are removed).
# Returns 0 on failure, after echoing the converter's error log to STDERR,
# appending it to the -errlog file (if one was given) and removing the
# .out, .html and .err files.  $dirname is accepted but not used here.
#
# NOTE: $timeout and $faillogfile are not parameters; they are whatever
# variables of those names are in scope at file level.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no stderr redirection here; stdout is captured in the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
677
# Convert a PDF file to text with the pdftotext command
#
# Runs pdftotext writing to "$output_filestem.text".  The result is only
# kept if the command succeeded and at least one line of the output contains
# a word character.
#
# Returns 1 on success (the .err and .out working files are removed).
# Returns 0 on failure, after echoing the converter's error log to STDERR,
# appending it to the -errlog file (if one was given) and removing the
# .out, .text and .err files.  $dirname is accepted but not used here.
#
# NOTE: $faillogfile is not a parameter; it is whatever variable of that
# name is in scope at file level.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            while (my $line = <$extracted>) {
                if ($line =~ /\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extracted);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        # the captured stdout is a working file too; don't leave it behind
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    # does this converter create a .out file?
    &util::rm($out_file)  if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
743
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Strategy:
#   1. Unless GSDLOS is windows, run gs with ps2ascii.ps, writing to
#      "$output_filestem.text".  A non-zero wait status, a missing output
#      file, or output whose first line starts with "Error: " counts as a
#      gs failure.
#   2. On gs failure (or on windows) fall back to stripping the text out of
#      the PostScript source with a series of regular expressions.
#   3. Re-wrap the resulting text file: lines longer than 72 characters are
#      broken at the first whitespace after column 72.
#
# Returns 1 on success; 0 if the fallback could not read the input, could
# not write the output, or the input does not start with "%!".
#
# NOTE: $faillogfile is not a parameter; it is whatever variable of that
# name is in scope at file level.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Look at the whole wait status, not just the exit code in the high
        # byte: a gs that was killed by a signal has exit code 0 there.
        my $status = $?;
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($psout);
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        my $faillog;
        if ("$faillogfile" ne "" && open($faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in>); # whole .ps file; see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file...
        # (checked before the output file is created, so a rejected input
        # leaves no empty .text file behind)
        if ($text !~ /^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            my $badlog;
            if ($faillogfile ne "" && open($badlog, '>>', $faillogfile)) {
                print $badlog "Bad postscript header: not '%!'\n";
                close($badlog);
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or", not "||": with "||" the die was attached to the filename string
    # and could never fire.
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                # short enough (or, defensively, unsplittable): emit as is
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
910
911
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# First produces "$output_filestem.text" with any_to_text, then wraps that
# text in a minimal HTML page written to "$output_filestem.html": blank
# lines become "<p>", every other line is prefixed with "<br> ", and the
# characters &, < and > are escaped.  The intermediate .text file is removed.
#
# Returns 1 on success; 0 if no text could be extracted or if either file
# could not be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', $text_file)) {
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html, '>', $html_file)) {
        close($text);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html "<html><head>\n";
    print $html "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html "</head><body>\n\n";

    while (my $line = <$text>) {
        # "&" has to go first, or it would re-escape the entities
        # produced by the next two substitutions
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print $html "<p>";
        } else {
            print $html "<br> ", $line;
        }
    }
    print $html "\n</body></html>\n";

    close($html);
    close($text);

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
948
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Reads $input_filename in binary mode, one "\n"-terminated chunk at a
# time.  In each chunk every run of bytes outside 040-176 (octal) becomes a
# newline, pieces shorter than 10 characters are dropped, and whatever is
# left is appended to "$output_filestem.text".
#
# Returns 1 if at least one chunk produced output.  Returns 0 if either
# file could not be opened, or if nothing was extracted (the empty .text
# file is removed in that case).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', $text_file)) {
        close($in);
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}
Note: See TracBrowser for help on using the repository browser.