source: trunk/gsdl/bin/script/gsConvert.pl@ 3350

Last change on this file since 3350 was 3350, checked in by sjboddie, 22 years ago

Added -use_strings option to ConvertToPlug. The default behaviour for
plugins derived from ConvertToPlug (WordPlug, PDFPlug etc) is now to
exclude documents that can't be converted correctly. They won't use the
perl strings stuff to extract text unless the -use_strings option is
specified.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
46BEGIN {
47 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
56# Are we running on WinNT or Win2000 (or later)?
57my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
58if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
59
60my $use_strings;
61
62sub print_usage
63{
64 print STDERR "\n";
65 print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
66 print STDERR " or text using third-party programs.\n\n";
67 print STDERR " usage: $0 [options] filename\n";
68 print STDERR " options:\n\t-type\tdoc|pdf|ps|ppt|rtf|xls\t(input file type)\n";
69 print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
70 print STDERR "\t-output\thtml|text\n";
71 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
72 print STDERR "\t-use_strings\t(use strnigs to extract text if conversion fails)\n";
73 exit(1);
74}
75
76my $faillogfile="";
77
78sub main
79{
80 my (@ARGV) = @_;
81 my ($input_type,$output_type,$verbose,$timeout);
82
83 $timeout = 0;
84 # read command-line arguments
85 if (!parsargv::parse(\@ARGV,
86 'type/(doc|pdf|ps|ppt|rtf|xls)/', \$input_type,
87 '/errlog/.*/', \$faillogfile,
88 'output/(html|text)/', \$output_type,
89 'timeout/\d+/0',\$timeout,
90 'verbose/\d+/0', \$verbose,
91 'use_strings', \$use_strings))
92 {
93 print_usage();
94 }
95
96 # Make sure the input file exists and can be opened for reading
97 if (scalar(@ARGV!=1)) {
98 print_usage();
99 }
100
101 my $input_filename = $ARGV[0];
102 if (!-r $input_filename) {
103 print STDERR "Error: unable to open $input_filename for reading\n";
104 exit(1);
105 }
106
107 # Deduce filenames
108 my ($tailname,$dirname,$suffix)
109 = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
110 my $output_filestem = &util::filename_cat($dirname, "$tailname");
111
112 if ($input_type eq "")
113 {
114 $input_type = lc (substr($suffix,1,length($suffix)-1));
115 }
116
117 # Change to temporary working directory
118 my $stored_dir = cwd();
119 chdir ($dirname) || die "Unable to change to directory $dirname";
120
121 # Select convert utility
122 if (!defined $input_type) {
123 print STDERR "Error: No filename extension or input type defined\n";
124 exit(1);
125 }
126 elsif ($input_type eq "doc") {
127 print &convertDOC($input_filename, $output_filestem, $output_type);
128 print "\n";
129 }
130 elsif ($input_type eq "rtf") {
131 print &convertRTF($input_filename, $output_filestem, $output_type);
132 print "\n";
133 }
134 elsif ($input_type eq "pdf") {
135 print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
136 print "\n";
137 }
138 elsif ($input_type eq "ps") {
139 print &convertPS($input_filename, $output_filestem, $output_type);
140 print "\n";
141 }
142 elsif ($input_type eq "ppt") {
143 print &convertPPT($input_filename, $output_filestem, $output_type);
144 print "\n";
145 }
146 elsif ($input_type eq "xls") {
147 print &convertXLS($input_filename, $output_filestem, $output_type);
148 print "\n";
149 }
150 else {
151 print STDERR "Error: Unable to convert type '$input_type'\n";
152 exit(1);
153 }
154
155 # restore to original working directory
156 chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
157
158}
159
160&main(@ARGV);
161
162
163
164# Document-type conversion functions
165#
166# The following functions attempt to convert documents from their
167# input type to the specified output type. If no output type was
168# given, then they first attempt HTML, and then TEXT.
169#
170# Each returns the output type ("html" or "text") or "fail" if no
171# conversion is possible.
172
173# Convert a Microsoft word document
174
175sub convertDOC {
176 ($input_filename, $output_filestem, $output_type) = @_;
177
178 # Many .doc files are not in fact word documents!
179 my $realtype = &find_docfile_type($input_filename);
180
181 if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
182 return &convertWord678($input_filename, $output_filestem, $output_type);
183 } elsif ($realtype eq "rtf") {
184 return &convertRTF($input_filename, $output_filestem, $output_type);
185 } else {
186 return &convertAnything($input_filename, $output_filestem, $output_type);
187 }
188}
189
190# Convert a Microsoft word 6/7/8 document
191
192sub convertWord678 {
193 ($input_filename, $output_filestem, $output_type) = @_;
194
195 my $success = 0;
196
197 # Attempt specialised conversion to HTML
198 if (!$output_type || ($output_type =~ /html/i)) {
199 $success = &doc_to_html($input_filename, $output_filestem);
200 if ($success) {
201 return "html";
202 }
203 }
204
205 return &convertAnything($input_filename, $output_filestem, $output_type);
206}
207
208
209# Convert a Rich Text Format (RTF) file
210
211sub convertRTF {
212 ($input_filename, $output_filestem, $output_type) = @_;
213
214 my $success = 0;
215
216 # Attempt specialised conversion to HTML
217 if (!$output_type || ($output_type =~ /html/i)) {
218 $success = &rtf_to_html($input_filename, $output_filestem);
219 if ($success) {
220 return "html";
221 }
222 }
223
224# rtf is so ugly that's it's not worth running strings over.
225# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
226# return &convertAnything($input_filename, $output_filestem, $output_type);
227 return "fail";
228}
229
230
231# Convert an unidentified file
232
233sub convertAnything {
234 ($input_filename, $output_filestem, $output_type) = @_;
235
236 my $success = 0;
237
238 # Attempt simple conversion to HTML
239 if (!$output_type || ($output_type =~ /html/i)) {
240 $success = &any_to_html($input_filename, $output_filestem);
241 if ($success) {
242 return "html";
243 }
244 }
245
246 # Convert to text
247 if (!$output_type || ($output_type =~ /text/i)) {
248 $success = &any_to_text($input_filename, $output_filestem);
249 if ($success) {
250 return "text";
251 }
252 }
253 return "fail";
254}
255
256
257
258# Convert an Adobe PDF document
259
260sub convertPDF {
261 my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
262
263 my $success = 0;
264
265 # Attempt conversion to HTML
266 if (!$output_type || ($output_type =~ /html/i)) {
267 $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
268 if ($success) {
269 return "html";
270 }
271 }
272
273 # Attempt conversion to TEXT
274 if (!$output_type || ($output_type =~ /text/i)) {
275 $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
276 if ($success) {
277 return "text";
278 }
279 }
280
281 return "fail";
282
283}
284
285
286# Convert an Adobe PostScript document
287
288sub convertPS {
289 ($input_filename, $output_filestem, $output_type) = @_;
290
291 my $success = 0;
292
293 # Attempt conversion to TEXT
294 if (!$output_type || ($output_type =~ /text/i)) {
295 $success = &ps_to_text($input_filename, $output_filestem);
296 if ($success) {
297 return "text";
298 }
299 }
300
301 return "fail";
302
303}
304
305
306sub convertPPT {
307 my ($input_filename, $output_filestem, $output_type) = @_;
308
309 my $success = 0;
310
311 # Attempt conversion to HTML
312 if (!$output_type || ($output_type =~ /html/i)) {
313 # formulate the command
314 $cmd = "";
315 $cmd .= "perl -S ppttohtml.pl ";
316 $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
317 $cmd .= " 2>\"$output_filestem.err\""
318 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
319
320
321 # execute the command
322 $!=0;
323 if (system($cmd)!=0)
324 {
325 print STDERR "Powerpoint 95/97 converter failed $!\n";
326 } else {
327 return "html";
328 }
329 }
330
331 $success = &any_to_text($input_filename, $output_filestem);
332 if ($success) {
333 return "text";
334 }
335
336 return "fail";
337}
338
339
340sub convertXLS {
341 my ($input_filename, $output_filestem, $output_type) = @_;
342
343 my $success = 0;
344
345 # Attempt conversion to HTML
346 if (!$output_type || ($output_type =~ /html/i)) {
347 # formulate the command
348 $cmd = "";
349 $cmd .= "perl -S xlstohtml.pl ";
350 $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
351 $cmd .= " 2>\"$output_filestem.err\""
352 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
353
354
355 # execute the command
356 $!=0;
357 if (system($cmd)!=0)
358 {
359 print STDERR "Excel 95/97 converter failed $!\n";
360 } else {
361 return "html";
362 }
363 }
364
365 $success = &any_to_text($input_filename, $output_filestem);
366 if ($success) {
367 return "text";
368 }
369
370 return "fail";
371}
372
373
374
375
376
377# Find the real type of a .doc file
378#
379# We seem to have a lot of files with a .doc extension that are .rtf
380# files or Word 5 files. This function attempts to tell the difference.
381
382sub find_docfile_type {
383 ($input_filename) = @_;
384
385 open(CHK, "<$input_filename");
386 binmode(CHK);
387 my $line = "";
388 my $first = 1;
389
390 while (<CHK>) {
391
392 $line = $_;
393
394 if ($first) {
395 # check to see if this is an rtf file
396 if ($line =~ /^\{\\rtf/) {
397 close(CHK);
398 return "rtf";
399 }
400 $first = 0;
401 }
402
403 # is this is a word 6/7/8 document?
404 if ($line =~ /Word\.Document\.([678])/) {
405 close(CHK);
406 return "word$1";
407 }
408
409 }
410
411 return "unknown";
412}
413
414
415
416# Specific type-to-type conversions
417#
418# Each of the following functions attempts to convert a document from
419# a specific format to another. If they succeed they return 1 and leave
420# the output document(s) in the appropriate place; if they fail they
421# return 0 and delete any working files.
422
423
424# Attempt to convert a word document to html with the wv program
425
426sub doc_to_html {
427 ($input_filename, $output_filestem) = @_;
428
429 my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
430 $ENV{'GSDLOS'}, "wvWare");
431
432 # don't include path on windows (to avoid having to play about
433 # with quoting when GSDLHOME might contain spaces) but assume
434 # that the PATH is set up correctly
435 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
436
437 my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
438 "packages", "wv", "wvHtml.xml");
439
440 my $cmd = "";
441 if ($timeout) {$cmd = "ulimit -t $timeout;";}
442 $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
443 $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
444
445 # redirecting STDERR is a bad idea on windows 95/98
446 $cmd .= " 2> \"$output_filestem.err\""
447 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
448
449 # execute the command
450 $!=0;
451 if (system($cmd)!=0)
452 {
453 print STDERR "Error executing wv converter:$!\n";
454 if (-s "$output_filestem.err") {
455 open (ERRFILE, "<$output_filestem.err");
456
457 my $write_to_fail_log=0;
458 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
459 {$write_to_fail_log=1;}
460
461 my $line;
462 while ($line=<ERRFILE>) {
463 if ($line =~ /\w/) {
464 print STDERR "$line";
465 print FAILLOG "$line" if ($write_to_fail_log);
466 }
467 if ($line !~ m/startup error/) {next;}
468 print STDERR " (given an invalid .DOC file?)\n";
469 print FAILLOG " (given an invalid .DOC file?)\n"
470 if ($write_to_fail_log);
471
472 } # while ERRFILE
473 close FAILLOG if ($write_to_fail_log);
474 }
475 return 0; # we can try any_to_text
476 }
477
478 # Was the conversion successful?
479
480 if (-s "$output_filestem.html") {
481 open(TMP, "$output_filestem.html");
482 $line = <TMP>;
483 close(TMP);
484 if ($line && $line =~ /DOCTYPE HTML/) {
485 &util::rm("$output_filestem.err") if -e "$output_filestem.err";
486 return 1;
487 }
488 }
489
490 # If here, an error of some sort occurred
491 &util::rm("$output_filestem.html") if -e "$output_filestem.html";
492 if (-e "$output_filestem.err") {
493 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
494 open (ERRLOG,"$output_filestem.err");
495 while (<ERRLOG>) {print FAILLOG $_;}
496 close FAILLOG;
497 close ERRLOG;
498 }
499 &util::rm("$output_filestem.err");
500 }
501
502 return 0;
503}
504
505
506# Attempt to convert an RTF document to html with rtftohtml
507
508sub rtf_to_html {
509 my ($input_filename, $output_filestem) = @_;
510
511 # formulate the command
512 $cmd = "";
513 if ($timeout) {$cmd = "ulimit -t $timeout;";}
514 $cmd .= "rtftohtml";
515
516 $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
517
518 $cmd .= " 2>\"$output_filestem.err\""
519 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
520
521
522 # execute the command
523 $!=0;
524 if (system($cmd)!=0)
525 {
526 print STDERR "Error executing rtf converter $!\n";
527 # don't currently bother printing out error log...
528 # keep going, in case it still created an HTML file...
529 }
530
531 # Was the conversion successful?
532 my $was_successful=0;
533 if (-s "$output_filestem.html") {
534 # make sure we have some content other than header
535 open (HTML, "$output_filestem.html"); # what to do if fail?
536 my $line;
537 my $past_header=0;
538 while ($line=<HTML>) {
539
540 if ($past_header == 0) {
541 if ($line =~ /<body>/) {$past_header=1;}
542 next;
543 }
544
545 $line =~ s/<[^>]+>//g;
546 if ($line =~ /\w/ && $past_header) { # we found some content...
547 $was_successful=1;
548 last;
549 }
550 }
551 close HTML;
552 }
553
554 if ($was_successful) {
555 &util::rm("$output_filestem.err")
556 if (-e "$output_filestem.err");
557 # insert the (modified) table of contents, if it exists.
558 if (-e "${output_filestem}_ToC.html") {
559 &util::mv("$output_filestem.html","$output_filestem.src");
560 my $open_failed=0;
561 open HTMLSRC, "$output_filestem.src" || ++$open_failed;
562 open TOC, "${output_filestem}_ToC.html" || ++$open_failed;
563 open HTML, ">$output_filestem.html" || ++$open_failed;
564
565 if ($open_failed) {
566 close HTMLSRC;
567 close TOC;
568 close HTML;
569 &util::mv("$output_filestem.src","$output_filestem.html");
570 return 1;
571 }
572
573 # print out header info from src html.
574 while (($_ = <HTMLSRC>) =~ /\w/) {
575 print HTML "$_";
576 }
577
578 # print out table of contents, making links relative
579 <TOC>; <TOC>; # ignore first 2 lines
580 print HTML scalar(<TOC>); # line 3 = "<ol>\n"
581 my $line;
582 while ($line=<TOC>) {
583 $line =~ s@</body></html>$@@ ; # only last line has this
584 # make link relative
585 $line =~ s@href=\"[^\#]+@href=\"@;
586 print HTML $line;
587 }
588 close TOC;
589
590 # rest of html src
591 while (<HTMLSRC>) {
592 print HTML $_;
593 }
594 close HTMLSRC;
595 close HTML;
596
597 &util::rm("${output_filestem}_ToC.html");
598 &util::rm("${output_filestem}.src");
599 }
600 # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
601 return 1; # success
602 }
603
604 if (-e "$output_filestem.err") {
605 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
606 {
607 print FAILLOG "Error - rtftohtml - couldn't extract text\n";
608 print FAILLOG " (rtf file might be too recent):\n";
609 open (ERRLOG, "$output_filestem.err");
610 while (<ERRLOG>) {print FAILLOG $_;}
611 close ERRLOG;
612 close FAILLOG;
613 }
614 &util::rm("$output_filestem.err");
615 }
616
617 &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
618
619 return 0;
620}
621
622
623# Convert a pdf file to html with the pdftohtml command
624
625sub pdf_to_html {
626 my ($dirname, $input_filename, $output_filestem) = @_;
627
628 $cmd = "";
629 if ($timeout) {$cmd = "ulimit -t $timeout;";}
630 $cmd .= "perl -S pdftohtml.pl ";
631 $cmd .= " \"$input_filename\" \"$output_filestem\"";
632
633 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
634 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
635 } else {
636 $cmd .= " > \"$output_filestem.err\"";
637 }
638
639 $!=0;
640
641 my $retval=system($cmd);
642 if ($retval!=0)
643 {
644 print STDERR "Error executing pdftohtml.pl";
645 if ($!) {print STDERR ": $!";}
646 print STDERR "\n";
647 }
648
649 # make sure the converter made something
650 if ($retval!=0 || ! -s "$output_filestem.html")
651 {
652 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
653 # print out the converter's std err, if any
654 if (-s "$output_filestem.err") {
655 open (ERRLOG, "$output_filestem.err") || die "$!";
656 print STDERR "pdftohtml error log:\n";
657 while (<ERRLOG>) {
658 print STDERR "$_";
659 }
660 close ERRLOG;
661 }
662 &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
663 if (-e "$output_filestem.err") {
664 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
665 {
666 open (ERRLOG, "$output_filestem.err");
667 while (<ERRLOG>) {print FAILLOG $_;}
668 close ERRLOG;
669 close FAILLOG;
670 }
671 &util::rm("$output_filestem.err");
672 }
673 return 0;
674 }
675
676 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
677 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
678 return 1;
679}
680
681# Convert a PDF file to text with the pdftotext command
682
683sub pdf_to_text {
684 my ($dirname, $input_filename, $output_filestem) = @_;
685
686 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
687
688 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
689 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
690 } else {
691 $cmd .= " > \"$output_filestem.err\"";
692 }
693
694 if (system($cmd)!=0)
695 {
696 print STDERR "Error executing $cmd: $!\n";
697 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
698 }
699
700 # make sure there is some extracted text.
701 if (-e "$output_filestem.text") {
702 open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
703 binmode(EXTR_TEXT); # just in case...
704 my $line="";
705 my $seen_text=0;
706 while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
707 if ($line=~ /\w/) {$seen_text=1;}
708 }
709 close EXTR_TEXT;
710 if ($seen_text==0) { # no text was extracted
711 print STDERR "Error: pdftotext found no text\n";
712 &util::rm("$output_filestem.text");
713 }
714 }
715
716 # make sure the converter made something
717 if (! -s "$output_filestem.text")
718 {
719 # print out the converters std err, if any
720 if (-s "$output_filestem.err") {
721 open (ERRLOG, "$output_filestem.err") || die "$!";
722 print STDERR "pdftotext error log:\n";
723 while (<ERRLOG>) {
724 print STDERR "$_";
725 }
726 close ERRLOG;
727 }
728 # does this converter create a .out file?
729 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
730 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
731 if (-e "$output_filestem.err") {
732 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
733 {
734 open (ERRLOG,"$output_filestem.err");
735 while (<ERRLOG>) {print FAILLOG $_;}
736 close ERRLOG;
737 close FAILLOG;
738 }
739 &util::rm("$output_filestem.err");
740 }
741 return 0;
742 }
743 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
744 return 1;
745}
746
747# Convert a PostScript document to text
748# note - just using "ps2ascii" isn't good enough, as it
749# returns 0 for a postscript interpreter error. ps2ascii is just
750# a wrapper to "gs" anyway, so we use that cmd here.
751
752sub ps_to_text {
753 my ($input_filename, $output_filestem) = @_;
754
755 my $error = "";
756
757 # if we're on windows we'll fall straight through without attempting
758 # to use gs
759 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
760 $error = "Windows does not support gs";
761
762 } else {
763 my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
764 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
765 $cmd .= " 2> $output_filestem.err";
766 $!=0;
767
768 my $retcode=system($cmd);
769 $retcode = $? >> 8; # see man perlfunc - system for this...
770 # if system returns -1 | 127 (couldn't start program), look at $! for message
771
772 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
773 elsif (! -e "$output_filestem.text") {
774 $error="did not create output file.\n";
775 }
776 else
777 { # make sure the interpreter didn't get an error. It is technically
778 # possible for the actual text to start with this, but....
779 open PSOUT, "$output_filestem.text";
780 if (<PSOUT> =~ /^Error: (.*)/) {
781 $error="interpreter error - \"$1\"";
782 }
783 close PSOUT;
784 }
785 }
786
787 if ($error ne "")
788 {
789 print STDERR "Warning: Error executing gs: $error\n";
790 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
791
792 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
793 {
794 print FAILLOG "gs - $error\n";
795 if (-e "$output_filestem.err") {
796 open(ERRLOG, "$output_filestem.err");
797 while (<ERRLOG>) {print FAILLOG $_;}
798 close ERRLOG;
799 }
800 close FAILLOG;
801 }
802 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
803
804
805 # Fine then. We'll just do a lousy job by ourselves...
806 # Based on 5-line regexp sed script found at:
807 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
808 #
809 print STDERR "Stripping text from postscript\n";
810 my $errorcode=0;
811 open (IN, "$input_filename")
812 || ($errorcode=1, warn "Couldn't read file: $!");
813 open (OUT, ">$output_filestem.text")
814 || ($errorcode=1, warn "Couldn't write file: $!");
815 if ($errorcode) {print STDERR "errors\n";return 0;}
816
817 my $text=""; # this is for whole .ps file...
818 $text = join('', <IN>); # see man perlport, under "System Resources"
819 close IN;
820
821 # Make sure this is a ps file...
822 if ($text !~ /^%!/) {
823 print STDERR "Bad postscript header: not '%!'\n";
824 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
825 {
826 print FAILLOG "Bad postscript header: not '%!'\n";
827 close FAILLOG;
828 }
829 return 0;
830 }
831
832 # if ps has Page data, then use it to delete all stuff before it.
833 $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
834
835 # remove all leading non-data stuff
836 $text =~ s/^.*?\(//s;
837
838 # remove all newline chars for easier processing
839 $text =~ s/\n//g;
840
841 # Big assumption here - assume that if any co-ordinates are
842 # given, then we are at the end of a sentence.
843 $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
844
845 # special characters--
846 $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
847
848 # ? ps text formatting (eg italics?) ?
849 $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
850 $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
851 $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
852 # default - remove the rest
853 $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
854
855 # attempt to add whitespace between words...
856 # this is based purely on observation, and may be completely wrong...
857 $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
858 # eg I notice "b(" is sometimes NOT a space if preceded by a
859 # negative number.
860 $text =~ s/\)\d+ ?b\(/\) \( /g;
861
862 # change quoted braces to brackets
863 $text =~ s/([^\\])\\\(/$1\{/g;
864 $text =~ s/([^\\])\\\)/$1\}/g ;
865
866 # remove everything that is not between braces
867 $text =~ s/\)([^\(\)])+?\(//sg ;
868
869 # remove any Trailer eof stuff.
870 $text =~ s/\)[^\)]*$//sg;
871
872 ### ligatures have special characters...
873 $text =~ s/\\013/ff/g;
874 $text =~ s/\\014/fi/g;
875 $text =~ s/\\015/fl/g;
876 $text =~ s/\\016/ffi/g;
877 $text =~ s/\\214/fi/g;
878 $text =~ s/\\215/fl/g;
879 $text =~ s/\\017/\n\* /g; # asterisk?
880 $text =~ s/\\023/\023/g; # e acute ('e)
881 $text =~ s/\\177/\252/g; # u"
882# $text =~ s/ ?? /\344/g; # a"
883
884 print OUT "$text";
885 close OUT;
886 }
887 # wrap the text - use a minimum length. ie, first space after this length.
888 my $wrap_length=72;
889 &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
890 open INFILE, "$output_filestem.text.tmp" ||
891 die "Couldn't open file: $!";
892 open OUTFILE, ">$output_filestem.text" ||
893 die "Couldn't open file for writing: $!";
894 my $line="";
895 while ($line=<INFILE>) {
896 while (length($line)>0) {
897 if (length($line)>$wrap_length) {
898 $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
899 print OUTFILE "$1\n";
900 } else {
901 print OUTFILE "$line";
902 $line="";
903 }
904 }
905 }
906 close INFILE;
907 close OUTFILE;
908 &util::rm("$output_filestem.text.tmp");
909
910 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
911 return 1;
912}
913
914
915# Convert any file to HTML with a crude perl implementation of the
916# UNIX strings command.
917
918sub any_to_html {
919 ($input_filename, $output_filestem) = @_;
920
921 # First generate a text file
922 return 0 unless (&any_to_text($input_filename, $output_filestem));
923
924 # create an HTML file from the text file
925 open(TEXT, "<$output_filestem.text");
926 open(HTML, ">$output_filestem.html");
927
928 print HTML "<html><head>\n";
929 print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
930 print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
931 print HTML "</head><body>\n\n";
932
933 my $line;
934 while ($line=<TEXT>) {
935 $line =~ s/</&lt;/g;
936 $line =~ s/>/&gt;/g;
937 if ($line =~ /^\s*$/) {
938 print HTML "<p>";
939 } else {
940 print HTML "<br> ", $line;
941 }
942 }
943 print HTML "\n</body></html>\n";
944
945 close HTML;
946 close TEXT;
947
948 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
949 return 1;
950}
951
952# Convert any file to TEXT with a crude perl implementation of the
953# UNIX strings command.
954# Note - this assumes ascii charsets :( (jrm21)
955
956sub any_to_text {
957 ($input_filename, $output_filestem) = @_;
958
959 if (!$use_strings) {
960 return 0;
961 }
962
963 open(IN, "<$input_filename") || return 0;
964 binmode(IN);
965 open(OUT, ">$output_filestem.text") || return 0;
966
967 my ($line);
968 my $output_line_count = 0;
969 while (<IN>) {
970 $line = $_;
971
972 # delete anything that isn't a printable character
973 $line =~ s/[^\040-\176]+/\n/sg;
974
975 # delete any string less than 10 characters long
976 $line =~ s/^.{0,9}$/\n/mg;
977 while ($line =~ /^.{1,9}$/m) {
978 $line =~ s/^.{0,9}$/\n/mg;
979 $line =~ s/\n+/\n/sg;
980 }
981
982 # remove extraneous whitespace
983 $line =~ s/\n+/\n/gs;
984 $line =~ s/^\n//gs;
985
986 # output whatever is left
987 if ($line =~ /[^\n ]/) {
988 print OUT $line;
989 ++$output_line_count;
990 }
991 }
992
993 close OUT;
994 close IN;
995
996 if ($output_line_count) { # try to protect against binary only formats
997 return 1;
998 }
999
1000 &util::rm("$output_filestem.text");
1001 return 0;
1002
1003}
Note: See TracBrowser for help on using the repository browser.