source: tags/gsdl-2_37-distribution/gsdl/bin/script/gsConvert.pl@ 2843

Last change on this file since 2843 was 2755, checked in by jrm21, 23 years ago

import.pl now takes an option for saving file conversion failures to a log.
By default, import.pl will use <collectdir>/etc/fail.log. Currently only
the plugins based on ConvertToPlug will do this. Not yet tested on Win9X.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 24.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
35# using specialised conversion utilities. We can try to convert any file to
36# text with a perl implementation of the UNIX strings command.
37#
38# We try to convert Postscript files to text using "gs" which is often on
39# *nix machines. We fall back to performing weak text extraction by using
40# regular expressions.
41
42BEGIN {
43 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
44 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
45}
46
47use parsargv;
48use util;
49use Cwd;
50use File::Basename;
51
52# Are we running on WinNT or Win2000 (or later)?
53my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
54if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
55
56sub print_usage
57{
58 print STDERR "\n";
59 print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
60 print STDERR " or text using third-party programs.\n\n";
61 print STDERR " usage: $0 [options] filename\n";
62 print STDERR " options:\n\t-type\tdoc|pdf|ps|rtf\t(input file type)\n";
63 print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
64 print STDERR "\t-output\thtml|text\n";
65 print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
66 exit(1);
67}
68
69my $faillogfile="";
70
71sub main
72{
73 my (@ARGV) = @_;
74 my ($input_type,$output_type,$verbose,$timeout);
75
76 $timeout = 0;
77 # read command-line arguments
78 if (!parsargv::parse(\@ARGV,
79 'type/(doc|pdf|ps|rtf)/', \$input_type,
80 '/errlog/.*/', \$faillogfile,
81 'output/(html|text)/', \$output_type,
82 'timeout/\d+/0',\$timeout,
83 'verbose/\d+/0', \$verbose))
84 {
85 print_usage();
86 }
87
88 # Make sure the input file exists and can be opened for reading
89 if (scalar(@ARGV!=1)) {
90 print_usage();
91 }
92
93 my $input_filename = $ARGV[0];
94 if (!-r $input_filename) {
95 print STDERR "Error: unable to open $input_filename for reading\n";
96 exit(1);
97 }
98
99 # Deduce filenames
100 my ($tailname,$dirname,$suffix)
101 = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
102 my $output_filestem = &util::filename_cat($dirname, "$tailname");
103
104 if ($input_type eq "")
105 {
106 $input_type = lc (substr($suffix,1,length($suffix)-1));
107 }
108
109 # Change to temporary working directory
110 my $stored_dir = cwd();
111 chdir ($dirname) || die "Unable to change to directory $dirname";
112
113 # Select convert utility
114 if (!defined $input_type) {
115 print STDERR "Error: No filename extension or input type defined\n";
116 exit(1);
117 }
118 elsif ($input_type eq "doc") {
119 print &convertDOC($input_filename, $output_filestem, $output_type);
120 print "\n";
121 }
122 elsif ($input_type eq "rtf") {
123 print &convertRTF($input_filename, $output_filestem, $output_type);
124 print "\n";
125 }
126 elsif ($input_type eq "pdf") {
127 print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
128 print "\n";
129 }
130 elsif ($input_type eq "ps") {
131 print &convertPS($input_filename, $output_filestem, $output_type);
132 print "\n";
133 }
134 else {
135 print STDERR "Error: Unable to convert type '$input_type'\n";
136 exit(1);
137 }
138
139 # restore to original working directory
140 chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
141
142}
143
144&main(@ARGV);
145
146
147
148# Document-type conversion functions
149#
150# The following functions attempt to convert documents from their
151# input type to the specified output type. If no output type was
152# given, then they first attempt HTML, and then TEXT.
153#
154# Each returns the output type ("html" or "text") or "fail" if no
155# conversion is possible.
156
157# Convert a Microsoft word document
158
159sub convertDOC {
160 ($input_filename, $output_filestem, $output_type) = @_;
161
162 # Many .doc files are not in fact word documents!
163 my $realtype = &find_docfile_type($input_filename);
164
165 if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
166 return &convertWord678($input_filename, $output_filestem, $output_type);
167 } elsif ($realtype eq "rtf") {
168 return &convertRTF($input_filename, $output_filestem, $output_type);
169 } else {
170 return &convertAnything($input_filename, $output_filestem, $output_type);
171 }
172}
173
174# Convert a Microsoft word 6/7/8 document
175
176sub convertWord678 {
177 ($input_filename, $output_filestem, $output_type) = @_;
178
179 my $success = 0;
180
181 # Attempt specialised conversion to HTML
182 if (!$output_type || ($output_type =~ /html/i)) {
183 $success = &doc_to_html($input_filename, $output_filestem);
184 if ($success) {
185 return "html";
186 }
187 }
188
189 return &convertAnything($input_filename, $output_filestem, $output_type);
190}
191
192
193# Convert a Rich Text Format (RTF) file
194
195sub convertRTF {
196 ($input_filename, $output_filestem, $output_type) = @_;
197
198 my $success = 0;
199
200 # Attempt specialised conversion to HTML
201 if (!$output_type || ($output_type =~ /html/i)) {
202 $success = &rtf_to_html($input_filename, $output_filestem);
203 if ($success) {
204 return "html";
205 }
206 }
207
208# rtf is so ugly that's it's not worth running strings over.
209# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
210# return &convertAnything($input_filename, $output_filestem, $output_type);
211 return "fail";
212}
213
214
215# Convert an unidentified file
216
217sub convertAnything {
218 ($input_filename, $output_filestem, $output_type) = @_;
219
220 my $success = 0;
221
222 # Attempt simple conversion to HTML
223 if (!$output_type || ($output_type =~ /html/i)) {
224 $success = &any_to_html($input_filename, $output_filestem);
225 if ($success) {
226 return "html";
227 }
228 }
229
230 # Convert to text
231 if (!$output_type || ($output_type =~ /text/i)) {
232 $success = &any_to_text($input_filename, $output_filestem);
233 if ($success) {
234 return "text";
235 }
236 }
237 return "fail";
238}
239
240
241
242# Convert an Adobe PDF document
243
244sub convertPDF {
245 my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
246
247 my $success = 0;
248
249 # Attempt conversion to HTML
250 if (!$output_type || ($output_type =~ /html/i)) {
251 $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
252 if ($success) {
253 return "html";
254 }
255 }
256
257 # Attempt conversion to TEXT
258 if (!$output_type || ($output_type =~ /text/i)) {
259 $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
260 if ($success) {
261 return "text";
262 }
263 }
264
265 return "fail";
266
267}
268
269
270# Convert an Adobe PostScript document
271
272sub convertPS {
273 ($input_filename, $output_filestem, $output_type) = @_;
274
275 my $success = 0;
276
277 # Attempt conversion to TEXT
278 if (!$output_type || ($output_type =~ /text/i)) {
279 $success = &ps_to_text($input_filename, $output_filestem);
280 if ($success) {
281 return "text";
282 }
283 }
284
285 return "fail";
286
287}
288
289
290# Find the real type of a .doc file
291#
292# We seem to have a lot of files with a .doc extension that are .rtf
293# files or Word 5 files. This function attempts to tell the difference.
294
295sub find_docfile_type {
296 ($input_filename) = @_;
297
298 open(CHK, "<$input_filename");
299 binmode(CHK);
300 my $line = "";
301 my $first = 1;
302
303 while (<CHK>) {
304
305 $line = $_;
306
307 if ($first) {
308 # check to see if this is an rtf file
309 if ($line =~ /^\{\\rtf/) {
310 close(CHK);
311 return "rtf";
312 }
313 $first = 0;
314 }
315
316 # is this is a word 6/7/8 document?
317 if ($line =~ /Word\.Document\.([678])/) {
318 close(CHK);
319 return "word$1";
320 }
321
322 }
323
324 return "unknown";
325}
326
327
328
329# Specific type-to-type conversions
330#
331# Each of the following functions attempts to convert a document from
332# a specific format to another. If they succeed they return 1 and leave
333# the output document(s) in the appropriate place; if they fail they
334# return 0 and delete any working files.
335
336
337# Attempt to convert a word document to html with the wv program
338
339sub doc_to_html {
340 ($input_filename, $output_filestem) = @_;
341
342 my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
343 $ENV{'GSDLOS'}, "wvWare");
344
345 # don't include path on windows (to avoid having to play about
346 # with quoting when GSDLHOME might contain spaces) but assume
347 # that the PATH is set up correctly
348 $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
349
350 my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
351 "packages", "wv", "wvHtml.xml");
352
353 my $cmd = "";
354 if ($timeout) {$cmd = "ulimit -t $timeout;";}
355 $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
356 $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
357
358 # redirecting STDERR is a bad idea on windows 95/98
359 $cmd .= " 2> \"$output_filestem.err\""
360 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
361
362 # execute the command
363 $!=0;
364 if (system($cmd)!=0)
365 {
366 print STDERR "Error executing wv converter:$!\n";
367 if (-s "$output_filestem.err") {
368 open (ERRFILE, "<$output_filestem.err");
369
370 my $write_to_fail_log=0;
371 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
372 {$write_to_fail_log=1;}
373
374 my $line;
375 while ($line=<ERRFILE>) {
376 if ($line =~ /\w/) {
377 print STDERR "$line";
378 print FAILLOG "$line" if ($write_to_fail_log);
379 }
380 if ($line !~ m/startup error/) {next;}
381 print STDERR " (given an invalid .DOC file?)\n";
382 print FAILLOG " (given an invalid .DOC file?)\n"
383 if ($write_to_fail_log);
384
385 } # while ERRFILE
386 close FAILLOG if ($write_to_fail_log);
387 }
388 print STDERR "Continuing...\n";
389 return 0; # we can try any_to_text
390 }
391
392 # Was the conversion successful?
393
394 if (-s "$output_filestem.html") {
395 open(TMP, "$output_filestem.html");
396 $line = <TMP>;
397 close(TMP);
398 if ($line && $line =~ /DOCTYPE HTML/) {
399 &util::rm("$output_filestem.err") if -e "$output_filestem.err";
400 return 1;
401 }
402 }
403
404 # If here, an error of some sort occurred
405 &util::rm("$output_filestem.html") if -e "$output_filestem.html";
406 if (-e "$output_filestem.err") {
407 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
408 open (ERRLOG,"$output_filestem.err");
409 while (<ERRLOG>) {print FAILLOG $_;}
410 close FAILLOG;
411 close ERRLOG;
412 }
413 &util::rm("$output_filestem.err");
414 }
415
416 return 0;
417}
418
419
420# Attempt to convert an RTF document to html with rtftohtml
421
422sub rtf_to_html {
423 my ($input_filename, $output_filestem) = @_;
424
425 # formulate the command
426 $cmd = "";
427 if ($timeout) {$cmd = "ulimit -t $timeout;";}
428 $cmd .= "rtftohtml";
429
430 # it automatically uses $output_filestem.html
431 $cmd .= " \"$input_filename\"";
432
433 $cmd .= " 2>\"$output_filestem.err\""
434 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
435
436
437 # execute the command
438 $!=0;
439 if (system($cmd)!=0)
440 {
441 print STDERR "Error executing rtf converter $!\n";
442 # don't currently bother printing out error log...
443 # keep going, in case it still created an HTML file...
444 }
445
446 # Was the conversion successful?
447 my $was_successful=0;
448 if (-s "$output_filestem.html") {
449 # make sure we have some content other than header
450 open (HTML, "$output_filestem.html"); # what to do if fail?
451 my $line;
452 my $past_header=0;
453 while ($line=<HTML>) {
454
455 if ($past_header == 0) {
456 if ($line =~ /<body>/) {$past_header=1;}
457 next;
458 }
459
460 $line =~ s/<[^>]+>//g;
461 if ($line =~ /\w/ && $past_header) { # we found some content...
462 $was_successful=1;
463 last;
464 }
465 }
466 close HTML;
467 }
468
469 if ($was_successful) {
470 &util::rm("$output_filestem.err")
471 if (-e "$output_filestem.err");
472 # insert the (modified) table of contents, if it exists.
473 if (-e "${output_filestem}_ToC.html") {
474 &util::mv("$output_filestem.html","$output_filestem.src");
475 my $open_failed=0;
476 open HTMLSRC, "$output_filestem.src" || ++$open_failed;
477 open TOC, "${output_filestem}_ToC.html" || ++$open_failed;
478 open HTML, ">$output_filestem.html" || ++$open_failed;
479
480 if ($open_failed) {
481 close HTMLSRC;
482 close TOC;
483 close HTML;
484 &util::mv("$output_filestem.src","$output_filestem.html");
485 return 1;
486 }
487
488 # print out header info from src html.
489 while (($_ = <HTMLSRC>) =~ /\w/) {
490 print HTML "$_";
491 }
492
493 # print out table of contents, making links relative
494 <TOC>; <TOC>; # ignore first 2 lines
495 print HTML scalar(<TOC>); # line 3 = "<ol>\n"
496 my $line;
497 while ($line=<TOC>) {
498 $line =~ s@</body></html>$@@ ; # only last line has this
499 # make link relative
500 $line =~ s@href=\"[^\#]+@href=\"@;
501 print HTML $line;
502 }
503 close TOC;
504
505 # rest of html src
506 while (<HTMLSRC>) {
507 print HTML $_;
508 }
509 close HTMLSRC;
510 close HTML;
511
512 &util::rm("${output_filestem}_ToC.html");
513 &util::rm("${output_filestem}.src");
514 }
515 # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
516 return 1; # success
517 }
518
519 if (-e "$output_filestem.err") {
520 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
521 {
522 print FAILLOG "Error - rtftohtml - couldn't extract text\n";
523 print FAILLOG " (rtf file might be too recent):\n";
524 open (ERRLOG, "$output_filestem.err");
525 while (<ERRLOG>) {print FAILLOG $_;}
526 close ERRLOG;
527 close FAILLOG;
528 }
529 &util::rm("$output_filestem.err");
530 }
531
532 &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
533
534 return 0;
535}
536
537
538# Convert a pdf file to html with the pdftohtml command
539
540sub pdf_to_html {
541 my ($dirname, $input_filename, $output_filestem) = @_;
542
543 $cmd = "";
544 if ($timeout) {$cmd = "ulimit -t $timeout;";}
545 $cmd .= "perl -S pdftohtml.pl ";
546 $cmd .= " \"$input_filename\" \"$output_filestem\"";
547
548 if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
549 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
550 } else {
551 $cmd .= " > \"$output_filestem.err\"";
552 }
553
554 $!=0;
555
556 my $retval=system($cmd);
557 if ($retval!=0)
558 {
559 print STDERR "Error executing pdftohtml.pl";
560 if ($!) {print STDERR ": $!";}
561 print STDERR "\n";
562 }
563
564 # make sure the converter made something
565 if ($retval!=0 || ! -s "$output_filestem.html")
566 {
567 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
568 # print out the converter's std err, if any
569 if (-s "$output_filestem.err") {
570 open (ERRLOG, "$output_filestem.err") || die "$!";
571 print STDERR "pdftohtml error log:\n";
572 while (<ERRLOG>) {
573 print STDERR "$_";
574 }
575 close ERRLOG;
576 }
577 &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
578 if (-e "$output_filestem.err") {
579 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
580 {
581 open (ERRLOG, "$output_filestem.err");
582 while (<ERRLOG>) {print FAILLOG $_;}
583 close ERRLOG;
584 close FAILLOG;
585 }
586 &util::rm("$output_filestem.err");
587 }
588 return 0;
589 }
590
591 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
592 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
593 return 1;
594}
595
596# Convert a PDF file to text with the pdftotext command
597
598sub pdf_to_text {
599 my ($dirname, $input_filename, $output_filestem) = @_;
600
601 my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
602
603 if ($ENV{'GSDLOS'} !~ /^windows$/i) {
604 $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
605 } else {
606 $cmd .= " > \"$output_filestem.err\"";
607 }
608
609 if (system($cmd)!=0)
610 {
611 print STDERR "Error executing $cmd: $!\n";
612 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
613 }
614
615 # make sure there is some extracted text.
616 if (-e "$output_filestem.text") {
617 open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
618 binmode(EXTR_TEXT); # just in case...
619 my $line="";
620 my $seen_text=0;
621 while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
622 if ($line=~ /\w/) {$seen_text=1;}
623 }
624 close EXTR_TEXT;
625 if ($seen_text==0) { # no text was extracted
626 print STDERR "Error: pdftotext found no text\n";
627 &util::rm("$output_filestem.text");
628 }
629 }
630
631 # make sure the converter made something
632 if (! -s "$output_filestem.text")
633 {
634 # print out the converters std err, if any
635 if (-s "$output_filestem.err") {
636 open (ERRLOG, "$output_filestem.err") || die "$!";
637 print STDERR "pdftotext error log:\n";
638 while (<ERRLOG>) {
639 print STDERR "$_";
640 }
641 close ERRLOG;
642 }
643 # does this converter create a .out file?
644 &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
645 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
646 if (-e "$output_filestem.err") {
647 if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
648 {
649 open (ERRLOG,"$output_filestem.err");
650 while (<ERRLOG>) {print FAILLOG $_;}
651 close ERRLOG;
652 close FAILLOG;
653 }
654 &util::rm("$output_filestem.err");
655 }
656 return 0;
657 }
658 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
659 return 1;
660}
661
662# Convert a PostScript document to text
663# note - just using "ps2ascii" isn't good enough, as it
664# returns 0 for a postscript interpreter error. ps2ascii is just
665# a wrapper to "gs" anyway, so we use that cmd here.
666
667sub ps_to_text {
668 my ($input_filename, $output_filestem) = @_;
669
670 my $error = "";
671
672 # if we're on windows we'll fall straight through without attempting
673 # to use gs
674 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
675 $error = "Windows does not support gs";
676
677 } else {
678 my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
679 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
680 $cmd .= " 2> $output_filestem.err";
681 $!=0;
682
683 my $retcode=system($cmd);
684 $retcode = $? >> 8; # see man perlfunc - system for this...
685 # if system returns -1 | 127 (couldn't start program), look at $! for message
686
687 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
688 elsif (! -e "$output_filestem.text") {
689 $error="did not create output file.\n";
690 }
691 else
692 { # make sure the interpreter didn't get an error. It is technically
693 # possible for the actual text to start with this, but....
694 open PSOUT, "$output_filestem.text";
695 if (<PSOUT> =~ /^Error: (.*)/) {
696 $error="interpreter error - \"$1\"";
697 }
698 close PSOUT;
699 }
700 }
701
702 if ($error ne "")
703 {
704 print STDERR "Warning: Error executing gs: $error\n";
705 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
706
707 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
708 {
709 print FAILLOG "gs - $error\n";
710 if (-e "$output_filestem.err") {
711 open(ERRLOG, "$output_filestem.err");
712 while (<ERRLOG>) {print FAILLOG $_;}
713 close ERRLOG;
714 }
715 close FAILLOG;
716 }
717 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
718
719
720 # Fine then. We'll just do a lousy job by ourselves...
721 # Based on 5-line regexp sed script found at:
722 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
723 #
724 print STDERR "Stripping text from postscript\n";
725 my $errorcode=0;
726 open (IN, "$input_filename")
727 || ($errorcode=1, warn "Couldn't read file: $!");
728 open (OUT, ">$output_filestem.text")
729 || ($errorcode=1, warn "Couldn't write file: $!");
730 if ($errorcode) {print STDERR "errors\n";return 0;}
731
732 my $text=""; # this is for whole .ps file...
733 $text = join('', <IN>); # see man perlport, under "System Resources"
734 close IN;
735
736 # Make sure this is a ps file...
737 if ($text !~ /^%!/) {
738 print STDERR "Bad postscript header: not '%!'\n";
739 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
740 {
741 print FAILLOG "Bad postscript header: not '%!'\n";
742 close FAILLOG;
743 }
744 return 0;
745 }
746
747 # if ps has Page data, then use it to delete all stuff before it.
748 $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
749
750 # remove all leading non-data stuff
751 $text =~ s/^.*?\(//s;
752
753 # remove all newline chars for easier processing
754 $text =~ s/\n//g;
755
756 # Big assumption here - assume that if any co-ordinates are
757 # given, then we are at the end of a sentence.
758 $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
759
760 # special characters--
761 $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
762
763 # ? ps text formatting (eg italics?) ?
764 $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
765 $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
766 $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
767 # default - remove the rest
768 $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
769
770 # attempt to add whitespace between words...
771 # this is based purely on observation, and may be completely wrong...
772 $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
773 # eg I notice "b(" is sometimes NOT a space if preceded by a
774 # negative number.
775 $text =~ s/\)\d+ ?b\(/\) \( /g;
776
777 # change quoted braces to brackets
778 $text =~ s/([^\\])\\\(/$1\{/g;
779 $text =~ s/([^\\])\\\)/$1\}/g ;
780
781 # remove everything that is not between braces
782 $text =~ s/\)([^\(\)])+?\(//sg ;
783
784 # remove any Trailer eof stuff.
785 $text =~ s/\)[^\)]*$//sg;
786
787 ### ligatures have special characters...
788 $text =~ s/\\013/ff/g;
789 $text =~ s/\\014/fi/g;
790 $text =~ s/\\015/fl/g;
791 $text =~ s/\\016/ffi/g;
792 $text =~ s/\\214/fi/g;
793 $text =~ s/\\215/fl/g;
794 $text =~ s/\\017/\n\* /g; # asterisk?
795 $text =~ s/\\023/\023/g; # e acute ('e)
796 $text =~ s/\\177/\252/g; # u"
797# $text =~ s/ ?? /\344/g; # a"
798
799 print OUT "$text";
800 close OUT;
801 }
802 # wrap the text - use a minimum length. ie, first space after this length.
803 my $wrap_length=72;
804 &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
805 open INFILE, "$output_filestem.text.tmp" ||
806 die "Couldn't open file: $!";
807 open OUTFILE, ">$output_filestem.text" ||
808 die "Couldn't open file for writing: $!";
809 my $line="";
810 while ($line=<INFILE>) {
811 while (length($line)>0) {
812 if (length($line)>$wrap_length) {
813 $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
814 print OUTFILE "$1\n";
815 } else {
816 print OUTFILE "$line";
817 $line="";
818 }
819 }
820 }
821 close INFILE;
822 close OUTFILE;
823 &util::rm("$output_filestem.text.tmp");
824
825 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
826 return 1;
827}
828
829
830# Convert any file to HTML with a crude perl implementation of the
831# UNIX strings command.
832
833sub any_to_html {
834 ($input_filename, $output_filestem) = @_;
835
836 # First generate a text file
837 return 0 unless (&any_to_text($input_filename, $output_filestem));
838
839 # create an HTML file from the text file
840 open(TEXT, "<$output_filestem.text");
841 open(HTML, ">$output_filestem.html");
842
843 print HTML "<html><head>\n";
844 print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
845 print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
846 print HTML "</head><body>\n\n";
847
848 my $line;
849 while ($line=<TEXT>) {
850 $line =~ s/</&lt;/g;
851 $line =~ s/>/&gt;/g;
852 if ($line =~ /^\s*$/) {
853 print HTML "<p>";
854 } else {
855 print HTML "<br> ", $line;
856 }
857 }
858 print HTML "\n</body></html>\n";
859
860 close HTML;
861 close TEXT;
862
863 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
864 return 1;
865}
866
867# Convert any file to TEXT with a crude perl implementation of the
868# UNIX strings command.
869# Note - this assumes ascii charsets :( (jrm21)
870
871sub any_to_text {
872 ($input_filename, $output_filestem) = @_;
873
874 open(IN, "<$input_filename") || return 0;
875 binmode(IN);
876 open(OUT, ">$output_filestem.text") || return 0;
877
878 my ($line);
879 my $output_line_count = 0;
880 while (<IN>) {
881 $line = $_;
882
883 # delete anything that isn't a printable character
884 $line =~ s/[^\040-\176]+/\n/sg;
885
886 # delete any string less than 10 characters long
887 $line =~ s/^.{0,9}$/\n/mg;
888 while ($line =~ /^.{1,9}$/m) {
889 $line =~ s/^.{0,9}$/\n/mg;
890 $line =~ s/\n+/\n/sg;
891 }
892
893 # remove extraneous whitespace
894 $line =~ s/\n+/\n/gs;
895 $line =~ s/^\n//gs;
896
897 # output whatever is left
898 if ($line =~ /[^\n ]/) {
899 print OUT $line;
900 ++$output_line_count;
901 }
902 }
903
904 close OUT;
905 close IN;
906
907 if ($output_line_count) { # try to protect against binary only formats
908 return 1;
909 }
910
911 &util::rm("$output_filestem.text");
912 return 0;
913
914}
Note: See TracBrowser for help on using the repository browser.