source: trunk/gsdl/bin/script/gsConvert.pl@ 10282

Last change on this file since 10282 was 10282, checked in by chi, 19 years ago

Modifications to allow gsConvert to run either an open-source converting program or VB scripting for
certain types of document (e.g. Word, PPT, etc.)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 31.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Runs at compile time, before the "use" lines that follow are processed:
# refuse to start without GSDLHOME and put $GSDLHOME/perllib at the front
# of the module search path.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# mapped to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse().
my $use_strings;        # -use_strings
my $pdf_complex;        # -pdf_complex
my $pdf_nohidden;       # -pdf_nohidden
my $pdf_zoom;           # -pdf_zoom
my $pdf_ignore_images;  # -pdf_ignore_images
my $windows_scripting;  # -windows_scripting
66
# Write the usage summary to STDERR and terminate the script with exit
# status 1.  Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
87
# Option values that the conversion routines further down read as well.
my $faillogfile = "";   # -errlog: file that converter error output is appended to
my $timeout     = 0;    # -timeout: CPU seconds, used as "ulimit -t" (0 = no limit)

# main(@arguments)
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the input file's
# directory, runs the matching convertXXX routine and prints its result
# string followed by a newline on STDOUT.
#
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable input file or an unsupported type.
#
# NOTE(review): the input filename and output stem are passed on unchanged
# after the chdir, so a relative path with a directory part would no longer
# resolve - callers presumably pass absolute paths; confirm.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
            'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
            '/errlog/.*/', \$faillogfile,
            'output/(auto|html|text|pagedimg).*/', \$output_type,
            'timeout/\d+/0',\$timeout,
            'verbose/\d+/0', \$verbose,
            'use_strings', \$use_strings,
            'windows_scripting',\$windows_scripting,
            'pdf_complex', \$pdf_complex,
            'pdf_ignore_images', \$pdf_ignore_images,
            'pdf_nohidden', \$pdf_nohidden,
            'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left after the options
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the lower-cased extension without its
    # leading dot.  A file with no extension leaves the type undefined,
    # which is reported below.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    my %converter_for = (
        doc => sub { convertDOC($input_filename, $output_filestem, $output_type) },
        dot => sub { convertDOC($input_filename, $output_filestem, $output_type) },
        rtf => sub { convertRTF($input_filename, $output_filestem, $output_type) },
        pdf => sub { convertPDF($dirname, $input_filename, $output_filestem, $output_type) },
        ps  => sub { convertPS($input_filename, $output_filestem, $output_type) },
        ppt => sub { convertPPT($input_filename, $output_filestem, $output_type) },
        xls => sub { convertXLS($input_filename, $output_filestem, $output_type) },
    );

    my $converter = $converter_for{$input_type};
    if (!defined $converter) {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # The converter's result string is the script's only STDOUT output
    print $converter->();
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
177
# Script entry point: hand the command-line arguments to main().
main(@ARGV);
179
180
181
182# Document-type conversion functions
183#
184# The following functions attempt to convert documents from their
185# input type to the specified output type. If no output type was
186# given, then they first attempt HTML, and then TEXT.
187#
188# Each returns the output type ("html" or "text") or "fail" if no
189# conversion is possible.
190
# Convert a Microsoft word document
#
# Arguments: input filename, output file stem, requested output type (may be
# empty).  Sniffs the real file type first and delegates accordingly;
# returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype =~ /\Aword[678]\z/) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    return convertAnything($input_filename, $output_filestem, $output_type);
}
207
# Convert a Microsoft word 6/7/8 document
#
# When HTML is acceptable (no output type given, or one containing "html")
# the document is converted with the VB script if -windows_scripting was
# given, otherwise with wvWare.  Returns "html" on success; otherwise
# returns the result of convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        my $success = 0;
        if ($windows_scripting) {
            print STDERR "***** Calling VB Script!\n";
            $success = native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            print STDERR "**** Calling wvWare\n";
            $success = doc_to_html($input_filename, $output_filestem);
        }
        return "html" if $success;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
238
239
# Convert a Rich Text Format (RTF) file
#
# Returns "html" if rtf_to_html() succeeds, otherwise "fail".  There is no
# text fallback for RTF.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if rtf_to_html($input_filename, $output_filestem);
    }

    # rtf is so ugly that's it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
260
261
# Convert an unidentified file
#
# Tries any_to_html() when HTML is acceptable, then any_to_text() when text
# is acceptable.  Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
286
287
288
# Convert an Adobe PDF document
#
# HTML is tried first, then plain text; each only when the requested output
# type is empty or names that format.  Returns "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = !$output_type || ($output_type =~ /html/i);
    my $text_wanted = !$output_type || ($output_type =~ /text/i);

    # Attempt conversion to HTML
    if ($html_wanted
        && pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if ($text_wanted
        && pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
315
316
# Convert an Adobe PostScript document
#
# Only text output is supported: returns "text" if ps_to_text() succeeds
# and the requested type is empty or contains "text", otherwise "fail".
sub convertPS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
333
334
# Convert a Microsoft PowerPoint document
#
# If no output type was requested, -windows_scripting is set, or the
# requested type does not contain "html", the pptextract program is run
# with the output stem as its target, using -g/-j/-p when the requested
# type mentions gif/jpg/png.  Otherwise ppttohtml.pl is used.
#
# Returns "item" (pptextract succeeded, or the output directory already
# exists), "html" (ppttohtml.pl succeeded), "text" (any_to_text fallback)
# or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Work on a defined copy so the pattern matches below never see undef
    my $requested = defined $output_type ? $output_type : "";

    # redirecting STDERR is only done off Windows or on WinNT/2000
    my $redirect_stderr = ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    if (!$requested || $windows_scripting || ($requested !~ /html/i)) {
        my $ppt_convert_type = "";
        if ($requested =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($requested =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($requested =~ /png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # NOTE(review): the filenames are not quoted here, unlike the other
        # converters; quoted variants exist only as commented-out code in
        # the original, so this is left as it was.
        my $cmd = "$vbScript $ppt_convert_type $input_filename $output_filestem";
        $cmd .= " 2>\"$output_filestem.err\"" if $redirect_stderr;

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } else {
        # Attempt conversion to HTML
        # formulate the command
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $redirect_stderr;

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Both converters failed: fall back to plain string extraction
    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
397
398
# Convert a Microsoft Excel document
#
# Runs xlstohtml.pl when HTML is acceptable; if that is skipped or fails,
# falls back to any_to_text().  Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
431
432
433
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns "rtf" if the first line starts with "{\rtf", "word6"/"word7"/
# "word8" if any line contains "Word.Document.<6|7|8>", and "unknown"
# otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
469
470
471# Specific type-to-type conversions
472#
473# Each of the following functions attempts to convert a document from
474# a specific format to another. If they succeed they return 1 and leave
475# the output document(s) in the appropriate place; if they fail they
476# return 0 and delete any working files.
477
478
# Attempt to convert a word document to html with the wv program
#
# Writes "$output_filestem.html" (and "$output_filestem.err" where stderr
# is redirected).  Returns 1 if wvWare exits successfully and the first
# line of the HTML file contains "DOCTYPE HTML"; otherwise returns 0 after
# copying wvWare's error output to the fail log (if one was configured).
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                    $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                     "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($errfile);
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    return 0;
}
558
# Attempt to convert a word document to html with the word2html scripting program
#
# Writes "$output_filestem.html" (and "$output_filestem.err" where stderr
# is redirected).  Returns 1 if word2html exits successfully and the first
# line of the HTML file contains "html"; otherwise returns 0 after copying
# the converter's error output to the fail log (if one was configured).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                      $ENV{'GSDLOS'}, "word2html");

    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # NOTE(review): filenames are deliberately left unquoted here; the
    # quoted form survives only as commented-out code in the original.
    $cmd .= "$vbScript $input_filename $output_filestem.html";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($errfile);
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ /html/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    return 0;
}
630
631
632
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml to produce "$output_filestem.html".  The conversion counts
# as successful when the HTML file contains at least one word character
# (outside tags) after the line holding "<body>".  On success, a generated
# "${output_filestem}_ToC.html" (if present) is merged into the HTML file
# with its links made relative, and 1 is returned.  On failure the error
# output is appended to the fail log (if configured), working files are
# removed and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html" && open(HTML, "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <HTML>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ /<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close HTML;
    }

    if ($was_successful) {
        util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            util::mv("$output_filestem.html", "$output_filestem.src");

            # "or" (not "||") so that a failed open is actually counted
            my $open_failed = 0;
            open(HTMLSRC, "$output_filestem.src") or ++$open_failed;
            open(TOC, "${output_filestem}_ToC.html") or ++$open_failed;
            open(HTML, ">$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the ToC and put the converter's output back
                close HTMLSRC;
                close TOC;
                close HTML;
                util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (up to the first line
            # without a word character, which is consumed but not copied).
            my $src_line;
            while (defined($src_line = <HTMLSRC>) && $src_line =~ /\w/) {
                print HTML "$src_line";
            }

            # print out table of contents, making links relative
            my $skipped = <TOC>;      # ignore first 2 lines
            $skipped = <TOC>;
            print HTML scalar(<TOC>); # line 3 = "<ol>\n"
            while (my $toc_line = <TOC>) {
                $toc_line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@;
                print HTML $toc_line;
            }
            close TOC;

            # rest of html src
            while (defined($src_line = <HTMLSRC>)) {
                print HTML $src_line;
            }
            close HTMLSRC;
            close HTML;

            util::rm("${output_filestem}_ToC.html");
            util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            print FAILLOG "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print FAILLOG " (rtf file might be too recent):\n";
            if (open(ERRLOG, "$output_filestem.err")) {
                while (my $err_line = <ERRLOG>) {print FAILLOG $err_line;}
                close ERRLOG;
            }
            close FAILLOG;
        }
        util::rm("$output_filestem.err");
    }

    util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
750
751
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the zoom/complex/ignore-images/hidden switches
# taken from the command-line options.  Returns 1 if the command exits
# with status 0 and "$output_filestem.html" is non-empty.  Otherwise the
# converter's error output is echoed to STDERR and appended to the fail
# log (if configured), working files are removed and 0 is returned.
# $dirname is accepted but not used here.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # stderr is not redirected here; stdout goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open (ERRLOG, "$output_filestem.err") || die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $err_line = <ERRLOG>) {
                print STDERR "$err_line";
            }
            close ERRLOG;
        }

        util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
                if (open (ERRLOG, "$output_filestem.err")) {
                    while (my $err_line = <ERRLOG>) {print FAILLOG $err_line;}
                    close ERRLOG;
                }
                close FAILLOG;
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
812
# Convert a PDF file to text with the pdftotext command
#
# Runs pdftotext to produce "$output_filestem.text".  Returns 1 if the
# resulting file contains at least one word character.  Otherwise the
# converter's error output is echoed to STDERR and appended to the fail
# log (if configured), working files are removed and 0 is returned.
# $dirname is accepted but not used here.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open (EXTR_TEXT, "$output_filestem.text")) {
            binmode(EXTR_TEXT); # just in case...
            # test definedness, not truth, so a final line of "0" is
            # still examined
            while (defined(my $line = <EXTR_TEXT>)) {
                if ($line =~ /\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close EXTR_TEXT;
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open (ERRLOG, "$output_filestem.err") || die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <ERRLOG>) {
                print STDERR "$err_line";
            }
            close ERRLOG;
        }
        # does this converter create a .out file?
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
                if (open (ERRLOG, "$output_filestem.err")) {
                    while (my $err_line = <ERRLOG>) {print FAILLOG $err_line;}
                    close ERRLOG;
                }
                close FAILLOG;
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
878
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Off Windows, "gs" is run with ps2ascii.ps, writing to
# "$output_filestem.text".  If gs is skipped (Windows), exits abnormally,
# produces no file, or its output starts with "Error: ", the text is
# instead stripped out of the PostScript source with regular expressions.
# Finally the text is re-wrapped at the first whitespace after 72 columns.
#
# Returns 1 on success, 0 if the fallback cannot read/write its files or
# the input does not start with "%!".  Dies if the wrap step cannot open
# its files.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        # Use the whole wait status rather than just the exit code ($? >> 8)
        # so that death by signal (e.g. the CPU limit set via ulimit)
        # counts as a failure too.
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        elsif (open(PSOUT, "$output_filestem.text")) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <PSOUT>;
            if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close PSOUT;
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile"))) {
            print FAILLOG "gs - $error\n";
            if (-e "$output_filestem.err" && open(ERRLOG, "$output_filestem.err")) {
                while (my $err_line = <ERRLOG>) {print FAILLOG $err_line;}
                close ERRLOG;
            }
            close FAILLOG;
        }
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in_ok = open(IN, '<', $input_filename);
        warn "Couldn't read file: $!" unless $in_ok;
        my $out_ok = open(OUT, '>', "$output_filestem.text");
        warn "Couldn't write file: $!" unless $out_ok;
        if (!$in_ok || !$out_ok) {
            print STDERR "errors\n";
            # don't leave open handles or an empty output file behind
            close IN if $in_ok;
            if ($out_ok) {
                close OUT;
                util::rm("$output_filestem.text");
            }
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <IN>); # see man perlport, under "System Resources"
        close IN;

        # Make sure this is a ps file...
        if ($text !~ /^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
                print FAILLOG "Bad postscript header: not '%!'\n";
                close FAILLOG;
            }
            # remove the (still empty) output file created above
            close OUT;
            util::rm("$output_filestem.text");
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print OUT "$text";
        close OUT;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" (not "||") so that the die really fires when an open fails
    open(INFILE, "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(OUTFILE, ">$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line = <INFILE>) {
        while (length($line) > 0) {
            # If the substitution ever failed to match, the remainder is
            # written out as-is instead of looping forever.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print OUTFILE "$1\n";
            } else {
                print OUTFILE "$line";
                $line = "";
            }
        }
    }
    close INFILE;
    close OUTFILE;
    util::rm("$output_filestem.text.tmp");

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1046
1047
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Calls any_to_text() first, then wraps each line of the resulting text
# file in minimal HTML ("<p>" for blank lines, "<br> " before others) and
# writes "$output_filestem.html".  The intermediate text file is removed.
# Returns 1 on success, 0 if text extraction fails or a file cannot be
# opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    my $opened = open($text_fh, '<', "$output_filestem.text");
    if ($opened && !open($html_fh, '>', "$output_filestem.html")) {
        close($text_fh);
        $opened = 0;
    }
    if (!$opened) {
        # leave no working file behind on failure
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # "&" must be escaped first so the entities added for "<" and ">"
        # are not themselves re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close($html_fh);
    close($text_fh);

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1084
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Does nothing and returns 0 unless -use_strings was given.  Otherwise
# writes to "$output_filestem.text" every run of printable ASCII characters
# (octal 040-176) that is at least 10 characters long.  Returns 1 if any
# text was written; otherwise removes the output file and returns 0.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    util::rm("$output_filestem.text");
    return 0;
}
Note: See TracBrowser for help on using the repository browser.