source: trunk/gsdl/bin/script/gsConvert.pl@ 2977

Last change on this file since 2977 was 2977, checked in by jrm21, 22 years ago

added infrastructure for calling an external powerpoint to html converter.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
35# using specialised conversion utilities. We can try to convert any file to
36# text with a perl implementation of the UNIX strings command.
37#
38# We try to convert Postscript files to text using "gs" which is often on
39# *nix machines. We fall back to performing weak text extraction by using
40# regular expressions.
41
42BEGIN {
43 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
44 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
45}
46
47use parsargv;
48use util;
49use Cwd;
50use File::Basename;
51
52# Are we running on WinNT or Win2000 (or later)?
53my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
54if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
55
56sub print_usage
57{
58 print STDERR "\n";
59 print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
60 print STDERR " or text using third-party programs.\n\n";
61 print STDERR " usage: $0 [options] filename\n";
62 print STDERR " options:\n\t-type\tdoc|pdf|ps|ppt|rtf\t(input file type)\n";
63 print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
64 print STDERR "\t-output\thtml|text\n";
65 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
66 exit(1);
67}
68
69my $faillogfile="";
70
71sub main
72{
73 my (@ARGV) = @_;
74 my ($input_type,$output_type,$verbose,$timeout);
75
76 $timeout = 0;
77 # read command-line arguments
78 if (!parsargv::parse(\@ARGV,
79 'type/(doc|pdf|ps|ppt|rtf)/', \$input_type,
80 '/errlog/.*/', \$faillogfile,
81 'output/(html|text)/', \$output_type,
82 'timeout/\d+/0',\$timeout,
83 'verbose/\d+/0', \$verbose))
84 {
85 print_usage();
86 }
87
88 # Make sure the input file exists and can be opened for reading
89 if (scalar(@ARGV!=1)) {
90 print_usage();
91 }
92
93 my $input_filename = $ARGV[0];
94 if (!-r $input_filename) {
95 print STDERR "Error: unable to open $input_filename for reading\n";
96 exit(1);
97 }
98
99 # Deduce filenames
100 my ($tailname,$dirname,$suffix)
101 = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
102 my $output_filestem = &util::filename_cat($dirname, "$tailname");
103
104 if ($input_type eq "")
105 {
106 $input_type = lc (substr($suffix,1,length($suffix)-1));
107 }
108
109 # Change to temporary working directory
110 my $stored_dir = cwd();
111 chdir ($dirname) || die "Unable to change to directory $dirname";
112
113 # Select convert utility
114 if (!defined $input_type) {
115 print STDERR "Error: No filename extension or input type defined\n";
116 exit(1);
117 }
118 elsif ($input_type eq "doc") {
119 print &convertDOC($input_filename, $output_filestem, $output_type);
120 print "\n";
121 }
122 elsif ($input_type eq "rtf") {
123 print &convertRTF($input_filename, $output_filestem, $output_type);
124 print "\n";
125 }
126 elsif ($input_type eq "pdf") {
127 print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
128 print "\n";
129 }
130 elsif ($input_type eq "ps") {
131 print &convertPS($input_filename, $output_filestem, $output_type);
132 print "\n";
133 }
134 elsif ($input_type eq "ppt") {
135 print &convertPPT($input_filename, $output_filestem, $output_type);
136 print "\n";
137 }
138 else {
139 print STDERR "Error: Unable to convert type '$input_type'\n";
140 exit(1);
141 }
142
143 # restore to original working directory
144 chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
145
146}
147
148&main(@ARGV);
149
150
151
152# Document-type conversion functions
153#
154# The following functions attempt to convert documents from their
155# input type to the specified output type. If no output type was
156# given, then they first attempt HTML, and then TEXT.
157#
158# Each returns the output type ("html" or "text") or "fail" if no
159# conversion is possible.
160
161# Convert a Microsoft word document
162
163sub convertDOC {
164 ($input_filename, $output_filestem, $output_type) = @_;
165
166 # Many .doc files are not in fact word documents!
167 my $realtype = &find_docfile_type($input_filename);
168
169 if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
170 return &convertWord678($input_filename, $output_filestem, $output_type);
171 } elsif ($realtype eq "rtf") {
172 return &convertRTF($input_filename, $output_filestem, $output_type);
173 } else {
174 return &convertAnything($input_filename, $output_filestem, $output_type);
175 }
176}
177
178# Convert a Microsoft word 6/7/8 document
179
180sub convertWord678 {
181 ($input_filename, $output_filestem, $output_type) = @_;
182
183 my $success = 0;
184
185 # Attempt specialised conversion to HTML
186 if (!$output_type || ($output_type =~ /html/i)) {
187 $success = &doc_to_html($input_filename, $output_filestem);
188 if ($success) {
189 return "html";
190 }
191 }
192
193 return &convertAnything($input_filename, $output_filestem, $output_type);
194}
195
196
197# Convert a Rich Text Format (RTF) file
198
199sub convertRTF {
200 ($input_filename, $output_filestem, $output_type) = @_;
201
202 my $success = 0;
203
204 # Attempt specialised conversion to HTML
205 if (!$output_type || ($output_type =~ /html/i)) {
206 $success = &rtf_to_html($input_filename, $output_filestem);
207 if ($success) {
208 return "html";
209 }
210 }
211
212# rtf is so ugly that's it's not worth running strings over.
213# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
214# return &convertAnything($input_filename, $output_filestem, $output_type);
215 return "fail";
216}
217
218
219# Convert an unidentified file
220
221sub convertAnything {
222 ($input_filename, $output_filestem, $output_type) = @_;
223
224 my $success = 0;
225
226 # Attempt simple conversion to HTML
227 if (!$output_type || ($output_type =~ /html/i)) {
228 $success = &any_to_html($input_filename, $output_filestem);
229 if ($success) {
230 return "html";
231 }
232 }
233
234 # Convert to text
235 if (!$output_type || ($output_type =~ /text/i)) {
236 $success = &any_to_text($input_filename, $output_filestem);
237 if ($success) {
238 return "text";
239 }
240 }
241 return "fail";
242}
243
244
245
246# Convert an Adobe PDF document
247
248sub convertPDF {
249 my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
250
251 my $success = 0;
252
253 # Attempt conversion to HTML
254 if (!$output_type || ($output_type =~ /html/i)) {
255 $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
256 if ($success) {
257 return "html";
258 }
259 }
260
261 # Attempt conversion to TEXT
262 if (!$output_type || ($output_type =~ /text/i)) {
263 $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
264 if ($success) {
265 return "text";
266 }
267 }
268
269 return "fail";
270
271}
272
273
274# Convert an Adobe PostScript document
275
276sub convertPS {
277 ($input_filename, $output_filestem, $output_type) = @_;
278
279 my $success = 0;
280
281 # Attempt conversion to TEXT
282 if (!$output_type || ($output_type =~ /text/i)) {
283 $success = &ps_to_text($input_filename, $output_filestem);
284 if ($success) {
285 return "text";
286 }
287 }
288
289 return "fail";
290
291}
292
293
294sub convertPPT {
295 my ($input_filename, $output_filestem, $output_type) = @_;
296
297 my $success = 0;
298
299 # Attempt conversion to HTML
300 if (!$output_type || ($output_type =~ /html/i)) {
301 # formulate the command
302 $cmd = "";
303 $cmd .= "perl -S ppttohtml.pl ";
304 $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
305 $cmd .= " 2>\"$output_filestem.err\""
306 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
307
308
309 # execute the command
310 $!=0;
311 if (system($cmd)!=0)
312 {
313 print STDERR "Powerpoint converter failed $!\n";
314 } else {
315 return "html";
316 }
317 }
318
319 $success = &any_to_text($input_filename, $output_filestem);
320 if ($success) {
321 return "text";
322 }
323
324 return "fail";
325}
326
327
328
329
330
331# Find the real type of a .doc file
332#
333# We seem to have a lot of files with a .doc extension that are .rtf
334# files or Word 5 files. This function attempts to tell the difference.
335
336sub find_docfile_type {
337 ($input_filename) = @_;
338
339 open(CHK, "<$input_filename");
340 binmode(CHK);
341 my $line = "";
342 my $first = 1;
343
344 while (<CHK>) {
345
346 $line = $_;
347
348 if ($first) {
349 # check to see if this is an rtf file
350 if ($line =~ /^\{\\rtf/) {
351 close(CHK);
352 return "rtf";
353 }
354 $first = 0;
355 }
356
357 # is this is a word 6/7/8 document?
358 if ($line =~ /Word\.Document\.([678])/) {
359 close(CHK);
360 return "word$1";
361 }
362
363 }
364
365 return "unknown";
366}
367
368
369
370# Specific type-to-type conversions
371#
372# Each of the following functions attempts to convert a document from
373# a specific format to another. If they succeed they return 1 and leave
374# the output document(s) in the appropriate place; if they fail they
375# return 0 and delete any working files.
376
377
378# Attempt to convert a word document to html with the wv program
379
380sub doc_to_html {
381 ($input_filename, $output_filestem) = @_;
382
383 my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
384 $ENV{'GSDLOS'}, "wvWare");
385
386 # don't include path on windows (to avoid having to play about
387 # with quoting when GSDLHOME might contain spaces) but assume
388 # that the PATH is set up correctly
389 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
390
391 my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
392 "packages", "wv", "wvHtml.xml");
393
394 my $cmd = "";
395 if ($timeout) {$cmd = "ulimit -t $timeout;";}
396 $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
397 $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
398
399 # redirecting STDERR is a bad idea on windows 95/98
400 $cmd .= " 2> \"$output_filestem.err\""
401 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
402
403 # execute the command
404 $!=0;
405 if (system($cmd)!=0)
406 {
407 print STDERR "Error executing wv converter:$!\n";
408 if (-s "$output_filestem.err") {
409 open (ERRFILE, "<$output_filestem.err");
410
411 my $write_to_fail_log=0;
412 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
413 {$write_to_fail_log=1;}
414
415 my $line;
416 while ($line=<ERRFILE>) {
417 if ($line =~ /\w/) {
418 print STDERR "$line";
419 print FAILLOG "$line" if ($write_to_fail_log);
420 }
421 if ($line !~ m/startup error/) {next;}
422 print STDERR " (given an invalid .DOC file?)\n";
423 print FAILLOG " (given an invalid .DOC file?)\n"
424 if ($write_to_fail_log);
425
426 } # while ERRFILE
427 close FAILLOG if ($write_to_fail_log);
428 }
429 print STDERR "Continuing...\n";
430 return 0; # we can try any_to_text
431 }
432
433 # Was the conversion successful?
434
435 if (-s "$output_filestem.html") {
436 open(TMP, "$output_filestem.html");
437 $line = <TMP>;
438 close(TMP);
439 if ($line && $line =~ /DOCTYPE HTML/) {
440 &util::rm("$output_filestem.err") if -e "$output_filestem.err";
441 return 1;
442 }
443 }
444
445 # If here, an error of some sort occurred
446 &util::rm("$output_filestem.html") if -e "$output_filestem.html";
447 if (-e "$output_filestem.err") {
448 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
449 open (ERRLOG,"$output_filestem.err");
450 while (<ERRLOG>) {print FAILLOG $_;}
451 close FAILLOG;
452 close ERRLOG;
453 }
454 &util::rm("$output_filestem.err");
455 }
456
457 return 0;
458}
459
460
461# Attempt to convert an RTF document to html with rtftohtml
462
463sub rtf_to_html {
464 my ($input_filename, $output_filestem) = @_;
465
466 # formulate the command
467 $cmd = "";
468 if ($timeout) {$cmd = "ulimit -t $timeout;";}
469 $cmd .= "rtftohtml";
470
471 # it automatically uses $output_filestem.html
472 $cmd .= " \"$input_filename\"";
473
474 $cmd .= " 2>\"$output_filestem.err\""
475 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
476
477
478 # execute the command
479 $!=0;
480 if (system($cmd)!=0)
481 {
482 print STDERR "Error executing rtf converter $!\n";
483 # don't currently bother printing out error log...
484 # keep going, in case it still created an HTML file...
485 }
486
487 # Was the conversion successful?
488 my $was_successful=0;
489 if (-s "$output_filestem.html") {
490 # make sure we have some content other than header
491 open (HTML, "$output_filestem.html"); # what to do if fail?
492 my $line;
493 my $past_header=0;
494 while ($line=<HTML>) {
495
496 if ($past_header == 0) {
497 if ($line =~ /<body>/) {$past_header=1;}
498 next;
499 }
500
501 $line =~ s/<[^>]+>//g;
502 if ($line =~ /\w/ && $past_header) { # we found some content...
503 $was_successful=1;
504 last;
505 }
506 }
507 close HTML;
508 }
509
510 if ($was_successful) {
511 &util::rm("$output_filestem.err")
512 if (-e "$output_filestem.err");
513 # insert the (modified) table of contents, if it exists.
514 if (-e "${output_filestem}_ToC.html") {
515 &util::mv("$output_filestem.html","$output_filestem.src");
516 my $open_failed=0;
517 open HTMLSRC, "$output_filestem.src" || ++$open_failed;
518 open TOC, "${output_filestem}_ToC.html" || ++$open_failed;
519 open HTML, ">$output_filestem.html" || ++$open_failed;
520
521 if ($open_failed) {
522 close HTMLSRC;
523 close TOC;
524 close HTML;
525 &util::mv("$output_filestem.src","$output_filestem.html");
526 return 1;
527 }
528
529 # print out header info from src html.
530 while (($_ = <HTMLSRC>) =~ /\w/) {
531 print HTML "$_";
532 }
533
534 # print out table of contents, making links relative
535 <TOC>; <TOC>; # ignore first 2 lines
536 print HTML scalar(<TOC>); # line 3 = "<ol>\n"
537 my $line;
538 while ($line=<TOC>) {
539 $line =~ s@</body></html>$@@ ; # only last line has this
540 # make link relative
541 $line =~ s@href=\"[^\#]+@href=\"@;
542 print HTML $line;
543 }
544 close TOC;
545
546 # rest of html src
547 while (<HTMLSRC>) {
548 print HTML $_;
549 }
550 close HTMLSRC;
551 close HTML;
552
553 &util::rm("${output_filestem}_ToC.html");
554 &util::rm("${output_filestem}.src");
555 }
556 # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
557 return 1; # success
558 }
559
560 if (-e "$output_filestem.err") {
561 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
562 {
563 print FAILLOG "Error - rtftohtml - couldn't extract text\n";
564 print FAILLOG " (rtf file might be too recent):\n";
565 open (ERRLOG, "$output_filestem.err");
566 while (<ERRLOG>) {print FAILLOG $_;}
567 close ERRLOG;
568 close FAILLOG;
569 }
570 &util::rm("$output_filestem.err");
571 }
572
573 &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
574
575 return 0;
576}
577
578
579# Convert a pdf file to html with the pdftohtml command
580
581sub pdf_to_html {
582 my ($dirname, $input_filename, $output_filestem) = @_;
583
584 $cmd = "";
585 if ($timeout) {$cmd = "ulimit -t $timeout;";}
586 $cmd .= "perl -S pdftohtml.pl ";
587 $cmd .= " \"$input_filename\" \"$output_filestem\"";
588
589 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
590 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
591 } else {
592 $cmd .= " > \"$output_filestem.err\"";
593 }
594
595 $!=0;
596
597 my $retval=system($cmd);
598 if ($retval!=0)
599 {
600 print STDERR "Error executing pdftohtml.pl";
601 if ($!) {print STDERR ": $!";}
602 print STDERR "\n";
603 }
604
605 # make sure the converter made something
606 if ($retval!=0 || ! -s "$output_filestem.html")
607 {
608 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
609 # print out the converter's std err, if any
610 if (-s "$output_filestem.err") {
611 open (ERRLOG, "$output_filestem.err") || die "$!";
612 print STDERR "pdftohtml error log:\n";
613 while (<ERRLOG>) {
614 print STDERR "$_";
615 }
616 close ERRLOG;
617 }
618 &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
619 if (-e "$output_filestem.err") {
620 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
621 {
622 open (ERRLOG, "$output_filestem.err");
623 while (<ERRLOG>) {print FAILLOG $_;}
624 close ERRLOG;
625 close FAILLOG;
626 }
627 &util::rm("$output_filestem.err");
628 }
629 return 0;
630 }
631
632 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
633 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
634 return 1;
635}
636
637# Convert a PDF file to text with the pdftotext command
638
639sub pdf_to_text {
640 my ($dirname, $input_filename, $output_filestem) = @_;
641
642 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
643
644 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
645 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
646 } else {
647 $cmd .= " > \"$output_filestem.err\"";
648 }
649
650 if (system($cmd)!=0)
651 {
652 print STDERR "Error executing $cmd: $!\n";
653 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
654 }
655
656 # make sure there is some extracted text.
657 if (-e "$output_filestem.text") {
658 open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
659 binmode(EXTR_TEXT); # just in case...
660 my $line="";
661 my $seen_text=0;
662 while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
663 if ($line=~ /\w/) {$seen_text=1;}
664 }
665 close EXTR_TEXT;
666 if ($seen_text==0) { # no text was extracted
667 print STDERR "Error: pdftotext found no text\n";
668 &util::rm("$output_filestem.text");
669 }
670 }
671
672 # make sure the converter made something
673 if (! -s "$output_filestem.text")
674 {
675 # print out the converters std err, if any
676 if (-s "$output_filestem.err") {
677 open (ERRLOG, "$output_filestem.err") || die "$!";
678 print STDERR "pdftotext error log:\n";
679 while (<ERRLOG>) {
680 print STDERR "$_";
681 }
682 close ERRLOG;
683 }
684 # does this converter create a .out file?
685 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
686 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
687 if (-e "$output_filestem.err") {
688 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
689 {
690 open (ERRLOG,"$output_filestem.err");
691 while (<ERRLOG>) {print FAILLOG $_;}
692 close ERRLOG;
693 close FAILLOG;
694 }
695 &util::rm("$output_filestem.err");
696 }
697 return 0;
698 }
699 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
700 return 1;
701}
702
703# Convert a PostScript document to text
704# note - just using "ps2ascii" isn't good enough, as it
705# returns 0 for a postscript interpreter error. ps2ascii is just
706# a wrapper to "gs" anyway, so we use that cmd here.
707
708sub ps_to_text {
709 my ($input_filename, $output_filestem) = @_;
710
711 my $error = "";
712
713 # if we're on windows we'll fall straight through without attempting
714 # to use gs
715 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
716 $error = "Windows does not support gs";
717
718 } else {
719 my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
720 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
721 $cmd .= " 2> $output_filestem.err";
722 $!=0;
723
724 my $retcode=system($cmd);
725 $retcode = $? >> 8; # see man perlfunc - system for this...
726 # if system returns -1 | 127 (couldn't start program), look at $! for message
727
728 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
729 elsif (! -e "$output_filestem.text") {
730 $error="did not create output file.\n";
731 }
732 else
733 { # make sure the interpreter didn't get an error. It is technically
734 # possible for the actual text to start with this, but....
735 open PSOUT, "$output_filestem.text";
736 if (<PSOUT> =~ /^Error: (.*)/) {
737 $error="interpreter error - \"$1\"";
738 }
739 close PSOUT;
740 }
741 }
742
743 if ($error ne "")
744 {
745 print STDERR "Warning: Error executing gs: $error\n";
746 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
747
748 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
749 {
750 print FAILLOG "gs - $error\n";
751 if (-e "$output_filestem.err") {
752 open(ERRLOG, "$output_filestem.err");
753 while (<ERRLOG>) {print FAILLOG $_;}
754 close ERRLOG;
755 }
756 close FAILLOG;
757 }
758 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
759
760
761 # Fine then. We'll just do a lousy job by ourselves...
762 # Based on 5-line regexp sed script found at:
763 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
764 #
765 print STDERR "Stripping text from postscript\n";
766 my $errorcode=0;
767 open (IN, "$input_filename")
768 || ($errorcode=1, warn "Couldn't read file: $!");
769 open (OUT, ">$output_filestem.text")
770 || ($errorcode=1, warn "Couldn't write file: $!");
771 if ($errorcode) {print STDERR "errors\n";return 0;}
772
773 my $text=""; # this is for whole .ps file...
774 $text = join('', <IN>); # see man perlport, under "System Resources"
775 close IN;
776
777 # Make sure this is a ps file...
778 if ($text !~ /^%!/) {
779 print STDERR "Bad postscript header: not '%!'\n";
780 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
781 {
782 print FAILLOG "Bad postscript header: not '%!'\n";
783 close FAILLOG;
784 }
785 return 0;
786 }
787
788 # if ps has Page data, then use it to delete all stuff before it.
789 $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
790
791 # remove all leading non-data stuff
792 $text =~ s/^.*?\(//s;
793
794 # remove all newline chars for easier processing
795 $text =~ s/\n//g;
796
797 # Big assumption here - assume that if any co-ordinates are
798 # given, then we are at the end of a sentence.
799 $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
800
801 # special characters--
802 $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
803
804 # ? ps text formatting (eg italics?) ?
805 $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
806 $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
807 $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
808 # default - remove the rest
809 $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
810
811 # attempt to add whitespace between words...
812 # this is based purely on observation, and may be completely wrong...
813 $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
814 # eg I notice "b(" is sometimes NOT a space if preceded by a
815 # negative number.
816 $text =~ s/\)\d+ ?b\(/\) \( /g;
817
818 # change quoted braces to brackets
819 $text =~ s/([^\\])\\\(/$1\{/g;
820 $text =~ s/([^\\])\\\)/$1\}/g ;
821
822 # remove everything that is not between braces
823 $text =~ s/\)([^\(\)])+?\(//sg ;
824
825 # remove any Trailer eof stuff.
826 $text =~ s/\)[^\)]*$//sg;
827
828 ### ligatures have special characters...
829 $text =~ s/\\013/ff/g;
830 $text =~ s/\\014/fi/g;
831 $text =~ s/\\015/fl/g;
832 $text =~ s/\\016/ffi/g;
833 $text =~ s/\\214/fi/g;
834 $text =~ s/\\215/fl/g;
835 $text =~ s/\\017/\n\* /g; # asterisk?
836 $text =~ s/\\023/\023/g; # e acute ('e)
837 $text =~ s/\\177/\252/g; # u"
838# $text =~ s/ ?? /\344/g; # a"
839
840 print OUT "$text";
841 close OUT;
842 }
843 # wrap the text - use a minimum length. ie, first space after this length.
844 my $wrap_length=72;
845 &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
846 open INFILE, "$output_filestem.text.tmp" ||
847 die "Couldn't open file: $!";
848 open OUTFILE, ">$output_filestem.text" ||
849 die "Couldn't open file for writing: $!";
850 my $line="";
851 while ($line=<INFILE>) {
852 while (length($line)>0) {
853 if (length($line)>$wrap_length) {
854 $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
855 print OUTFILE "$1\n";
856 } else {
857 print OUTFILE "$line";
858 $line="";
859 }
860 }
861 }
862 close INFILE;
863 close OUTFILE;
864 &util::rm("$output_filestem.text.tmp");
865
866 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
867 return 1;
868}
869
870
871# Convert any file to HTML with a crude perl implementation of the
872# UNIX strings command.
873
874sub any_to_html {
875 ($input_filename, $output_filestem) = @_;
876
877 # First generate a text file
878 return 0 unless (&any_to_text($input_filename, $output_filestem));
879
880 # create an HTML file from the text file
881 open(TEXT, "<$output_filestem.text");
882 open(HTML, ">$output_filestem.html");
883
884 print HTML "<html><head>\n";
885 print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
886 print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
887 print HTML "</head><body>\n\n";
888
889 my $line;
890 while ($line=<TEXT>) {
891 $line =~ s/</&lt;/g;
892 $line =~ s/>/&gt;/g;
893 if ($line =~ /^\s*$/) {
894 print HTML "<p>";
895 } else {
896 print HTML "<br> ", $line;
897 }
898 }
899 print HTML "\n</body></html>\n";
900
901 close HTML;
902 close TEXT;
903
904 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
905 return 1;
906}
907
908# Convert any file to TEXT with a crude perl implementation of the
909# UNIX strings command.
910# Note - this assumes ascii charsets :( (jrm21)
911
912sub any_to_text {
913 ($input_filename, $output_filestem) = @_;
914
915 open(IN, "<$input_filename") || return 0;
916 binmode(IN);
917 open(OUT, ">$output_filestem.text") || return 0;
918
919 my ($line);
920 my $output_line_count = 0;
921 while (<IN>) {
922 $line = $_;
923
924 # delete anything that isn't a printable character
925 $line =~ s/[^\040-\176]+/\n/sg;
926
927 # delete any string less than 10 characters long
928 $line =~ s/^.{0,9}$/\n/mg;
929 while ($line =~ /^.{1,9}$/m) {
930 $line =~ s/^.{0,9}$/\n/mg;
931 $line =~ s/\n+/\n/sg;
932 }
933
934 # remove extraneous whitespace
935 $line =~ s/\n+/\n/gs;
936 $line =~ s/^\n//gs;
937
938 # output whatever is left
939 if ($line =~ /[^\n ]/) {
940 print OUT $line;
941 ++$output_line_count;
942 }
943 }
944
945 close OUT;
946 close IN;
947
948 if ($output_line_count) { # try to protect against binary only formats
949 return 1;
950 }
951
952 &util::rm("$output_filestem.text");
953 return 0;
954
955}
Note: See TracBrowser for help on using the repository browser.