source: trunk/gsdl/bin/script/gsConvert.pl@ 10493

Last change on this file since 10493 was 10464, checked in by chi, 19 years ago

Modified the if condition in convertPPT() so that the different
convert_to types are dealt with properly.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.0 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# The project's own perl modules (parsargv, util) are loaded from
# $GSDLHOME/perllib, so that directory has to be on @INC before the
# "use" lines below are compiled.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}

use parsargv;
use util;
use Cwd;
use File::Basename;

# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# mapped to 0 on the following line.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to
# parsargv::parse().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
sub print_usage
{
    # Write the command-line synopsis to STDERR, then terminate with
    # exit status 1.  This sub never returns to its caller.
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );
    print STDERR $_ foreach @usage_text;
    exit(1);
}
89
# Shared with the conversion subs below; both are filled in from the
# command line by main().
my $faillogfile="";     # -errlog: file that converter error output is appended to
my $timeout=0;          # -timeout: cpu-seconds limit ("ulimit -t"), 0 = none

# main(@command_line_arguments)
#
# Parses the options, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory of the
# input file, runs the matching convertXXX sub and prints the result
# type it returns followed by a newline on STDOUT.
# Exits with status 1 on usage errors, an unreadable input file, or an
# unsupported input type.
sub main
{
    # work on a private copy so the global @ARGV is neither shadowed nor
    # modified by the option parser
    my (@args) = @_;
    my ($input_type,$output_type,$verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting',\$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir() into the file's directory below.  A relative path such
    # as "docs/report.doc" would no longer resolve after that, so make
    # it absolute while the original working directory is still current.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the (lower-cased) filename extension.
    # $suffix is either "" or ".ext".
    if (!defined $input_type || $input_type eq "") {
        ($input_type = lc($suffix)) =~ s/^\.//;
    }

    # The converters treat an empty output type as "try html, then text".
    # NOTE(review): "auto" is accepted by the option pattern above, but
    # the converters only look for an empty type or for html/text/image
    # names -- confirm that callers never pass "-output auto".
    $output_type = "" unless defined $output_type;

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # report the type of output that was produced (or "fail")
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
183
184
185
186# Document-type conversion functions
187#
188# The following functions attempt to convert documents from their
189# input type to the specified output type. If no output type was
190# given, then they first attempt HTML, and then TEXT.
191#
192# Each returns the output type ("html", "text", or "item" for paged-image
193# output) or "fail" if no conversion is possible.
194
# Convert a Microsoft word document
#
# Arguments: input filename, output file stem, requested output type
# (empty means "whatever works").  Returns whatever the converter that
# was chosen for the detected file type returns.

sub convertDOC {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    } elsif ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    } else {
        return convertAnything($input_filename, $output_filestem, $output_type);
    }
}
211
# Convert a Microsoft word 6/7/8 document
#
# When html (or no particular type) is requested, tries the Windows
# scripting converter if -windows_scripting was given and the wv based
# converter otherwise; returns "html" if that worked.  In every other
# case the result of convertAnything() is returned.

sub convertWord678 {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        my $success = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
232
233
# Convert a Rich Text Format (RTF) file
#
# Only html output is attempted.  Returns "html" on success and "fail"
# otherwise (including when an output type other than html was asked
# for).

sub convertRTF {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (rtf_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
254
255
# Convert an unidentified file
#
# Tries any_to_html() and then any_to_text(), each only if the requested
# output type is empty or matches.  Returns "html", "text" or "fail".

sub convertAnything {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
280
281
282
# Convert an Adobe PDF document
#
# Returns "item" when the PDF was converted to page images, "html" or
# "text" for those conversions, and "fail" when nothing worked.

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last hyphen ("pagedimg-jpg" -> "jpg")
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ /jp?g|gif|png/i
        && pdf_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # an empty type means every remaining conversion may be tried in turn
    my $any_type = !$output_type;

    # Attempt conversion to HTML
    if (($any_type || $output_type =~ /html/i)
        && pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if (($any_type || $output_type =~ /text/i)
        && pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
317
318
# Convert an Adobe PostScript document
#
# Only text output is attempted.  Returns "text" on success, "fail"
# otherwise.

sub convertPS {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
335
336
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run to produce per-slide output in the
# directory $output_filestem ("item" is returned).  Otherwise, for html
# or an unspecified type, ppttohtml.pl is tried ("html").  If the chosen
# converter fails, any_to_text() is the fallback ("text"), and "fail" is
# returned when that does not work either.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # the pattern matches below would warn on an undefined value
    $output_type = "" unless defined $output_type;

    if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)) {
        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # image format switch handed to pptextract
        my $ppt_convert_type = "";
        if ($output_type =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ /png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # $cmd is lexical so it cannot be seen or clobbered by other subs
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type $input_filename $output_filestem";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } elsif (!$output_type || ($output_type =~ /html/i)) {
        # Attempt conversion to HTML
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $!=0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain text extraction
    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
398
399
# Convert a Microsoft Excel document
#
# For html or an unspecified output type, xlstohtml.pl is tried first
# ("html").  If that is skipped or fails, any_to_text() is the fallback
# ("text"); "fail" is returned when nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is lexical so that it cannot be
        # seen or clobbered by other subs)
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $!=0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain text extraction
    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
432
433
434
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns "rtf" if the first line starts with "{\rtf", "word6", "word7"
# or "word8" if a "Word.Document.N" marker is found on any line, and
# "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # three-argument open: the filename is never interpreted as a mode
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            $first = 0;
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
        }

        # is this is a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
470
471
472# Specific type-to-type conversions
473#
474# Each of the following functions attempts to convert a document from
475# a specific format to another. If they succeed they return 1 and leave
476# the output document(s) in the appropriate place; if they fail they
477# return 0 and delete any working files.
478
479
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare with its output redirected to "$output_filestem.html".
# Returns 1 if that file is non-empty and its first line contains
# "DOCTYPE HTML", 0 otherwise.  Converter error output is echoed to
# STDERR and/or appended to the -errlog file, if one was given.
sub doc_to_html {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            # $faillog stays undef unless a fail log was asked for AND
            # could be opened for appending
            my $faillog;
            if ($faillogfile ne "" && !open($faillog, '>>', $faillogfile)) {
                $faillog = undef;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($faillog);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($faillog);
                }
                # the handle used to be left open here
                close($errfile);
            }
            close($faillog) if ($faillog);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # append the converter's error output to the fail log, then
        # discard it
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
558
559
# Attempt to convert a word document to html with the word2html scripting program
#
# Returns 1 straight away if "$output_filestem.html" already exists.
# Otherwise runs word2html and returns 1 if the resulting file is
# non-empty and its first line contains "html", 0 otherwise.  Converter
# error output is echoed to STDERR and/or appended to the -errlog file,
# if one was given.
sub native_doc_to_html {
    # lexical copies: these must not leak into package globals
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # NOTE(review): the file names are passed unquoted; a quoted variant
    # of this line was commented out in the original, so it is left
    # unchanged here -- confirm whether word2html copes with quotes.
    $cmd .= "$vbScript $input_filename $output_filestem.html";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            # $faillog stays undef unless a fail log was asked for AND
            # could be opened for appending
            my $faillog;
            if ($faillogfile ne "" && !open($faillog, '>>', $faillogfile)) {
                $faillog = undef;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($faillog);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($faillog);
                }
                # the handle used to be left open here
                close($errfile);
            }
            close($faillog) if ($faillog);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # append the converter's error output to the fail log, then
        # discard it
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
# Attempt to convert an RTF document to html with rtftohtml
#
# Returns 1 if "$output_filestem.html" was produced and contains some
# word characters after its <body> line, 0 otherwise.  On success a
# separate "${output_filestem}_ToC.html" file, if present, is merged
# into the html file with its links made relative.  On failure the
# converter's error output is appended to the -errlog file (if one was
# given) and the working files are removed.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command ($cmd is lexical so that it cannot be seen
    # or clobbered by other subs)
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html"
        && open(my $html, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header=0;
        while (my $line = <$html>) {
            if (!$past_header) {
                $past_header=1 if ($line =~ /<body>/);
                next;
            }

            # strip tags, then look for anything word-like
            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful=1;
                last;
            }
        }
        close($html);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # "or" rather than "||": with "||" the test bound to the
            # filename string, so a failed open was never counted.
            my ($htmlsrc, $toc, $html);
            my $open_failed=0;
            open($htmlsrc, '<', "$output_filestem.src") or ++$open_failed;
            open($toc, '<', "${output_filestem}_ToC.html") or ++$open_failed;
            open($html, '>', "$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the table of contents and put the
                # converter's html file back in place
                foreach my $fh ($htmlsrc, $toc, $html) {
                    close($fh) if (defined fileno($fh));
                }
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (and
            # not including) the first line without a word character.
            my $header_line;
            while (defined($header_line = <$htmlsrc>) && $header_line =~ /\w/) {
                print $html "$header_line";
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;           # ignore first 2 lines
            $skipped = <$toc>;
            print $html scalar(<$toc>);     # line 3 = "<ol>\n"
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print $html $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print $html $line;
            }
            close($htmlsrc);
            close($html);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            #print $faillog "Error - rtf-converter - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
754
755
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the -pdf_* options from the command line.
# Returns 1 if the command succeeded and "$output_filestem.html" is
# non-empty; otherwise echoes the converter's error output to STDERR,
# appends it to the -errlog file (if one was given), removes the
# working files and returns 0.  $dirname is accepted but not used.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # $cmd is lexical so that it cannot be seen or clobbered by other subs
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no STDERR redirection on windows 95/98: STDOUT goes to the
        # .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;

    my $retval=system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval == 0 && -s "$output_filestem.html") {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # failure: report, log and clean up
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $errlog, '<', "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }
    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
817
# Convert a pdf file to various types of image with the convert command
#
# Checks that ImageMagick's "identify" can be run (skipped on Windows
# 95/98), then runs pdftoimg.pl.  Returns 1 if the command succeeded;
# otherwise echoes the converter's error output to STDERR, appends it
# to the -errlog file (if one was given), removes the working files and
# returns 0.  $dirname is accepted but not used.

sub pdf_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    # $is_winnt_2000 is used instead of calling Win32::IsWinNT() directly:
    # it was computed inside an eval at start-up, so this test cannot die
    # if the Win32 module is unavailable.
    if (!($ENV{'GSDLOS'} =~ /^windows$/i && !$is_winnt_2000)) {
        my $ignored_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    # $cmd is lexical so that it cannot be seen or clobbered by other subs
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # keep only what follows the last underscore of the output type
    $output_type =~ s/.*\_(.*)/$1/i;
    # the file names are quoted (as in pdf_to_html) so that paths with
    # spaces reach pdftoimg.pl as single arguments
    $cmd .= "perl -S pdftoimg.pl -convert_to $output_type";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no STDERR redirection on windows 95/98: STDOUT goes to the
        # .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;
    my $retval=system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # only the exit status is checked; the output itself is not inspected
    if ($retval == 0) {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # failure: report, log and clean up
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $errlog, '<', "$output_filestem.err") || die "$!";
        print STDERR "pdftoimg error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }
    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
886
# Convert a PDF file to text with the pdftotext command
#
# Returns 1 if "$output_filestem.text" was produced and contains at
# least one word character; otherwise echoes the converter's error
# output to STDERR, appends it to the -errlog file (if one was given),
# removes the working files and returns 0.  $dirname is accepted but
# not used.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text=0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ /\w/) {
                    $seen_text=1;
                    last;
                }
            }
            close($extr_text);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something usable
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        # the STDOUT redirection above creates a .out file on non-windows
        # systems; it used to be left behind after a successful run
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # failure: print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) || die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
952
953# Convert a PostScript document to text
954# note - just using "ps2ascii" isn't good enough, as it
955# returns 0 for a postscript interpreter error. ps2ascii is just
956# a wrapper to "gs" anyway, so we use that cmd here.
957
958sub ps_to_text {
959 my ($input_filename, $output_filestem) = @_;
960
961 my $error = "";
962
963 # if we're on windows we'll fall straight through without attempting
964 # to use gs
965 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
966 $error = "Windows does not support gs";
967
968 } else {
969 my $cmd = "";
970 if ($timeout) {$cmd = "ulimit -t $timeout; ";}
971 $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
972 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
973 #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
974 $cmd .= " 2> $output_filestem.err";
975 $!=0;
976
977 my $retcode=system($cmd);
978 $retcode = $? >> 8; # see man perlfunc - system for this...
979 # if system returns -1 | 127 (couldn't start program), look at $! for message
980
981 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
982 elsif (! -e "$output_filestem.text") {
983 $error="did not create output file.\n";
984 }
985 else
986 { # make sure the interpreter didn't get an error. It is technically
987 # possible for the actual text to start with this, but....
988 open PSOUT, "$output_filestem.text";
989 if (<PSOUT> =~ /^Error: (.*)/) {
990 $error="interpreter error - \"$1\"";
991 }
992 close PSOUT;
993 }
994 }
995
996 if ($error ne "")
997 {
998 print STDERR "Warning: Error executing gs: $error\n";
999 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1000
1001 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
1002 {
1003 print FAILLOG "gs - $error\n";
1004 if (-e "$output_filestem.err") {
1005 open(ERRLOG, "$output_filestem.err");
1006 while (<ERRLOG>) {print FAILLOG $_;}
1007 close ERRLOG;
1008 }
1009 close FAILLOG;
1010 }
1011 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1012
1013
1014 # Fine then. We'll just do a lousy job by ourselves...
1015 # Based on 5-line regexp sed script found at:
1016 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
1017 #
1018 print STDERR "Stripping text from postscript\n";
1019 my $errorcode=0;
1020 open (IN, "$input_filename")
1021 || ($errorcode=1, warn "Couldn't read file: $!");
1022 open (OUT, ">$output_filestem.text")
1023 || ($errorcode=1, warn "Couldn't write file: $!");
1024 if ($errorcode) {print STDERR "errors\n";return 0;}
1025
1026 my $text=""; # this is for whole .ps file...
1027 $text = join('', <IN>); # see man perlport, under "System Resources"
1028 close IN;
1029
1030 # Make sure this is a ps file...
1031 if ($text !~ /^%!/) {
1032 print STDERR "Bad postscript header: not '%!'\n";
1033 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
1034 {
1035 print FAILLOG "Bad postscript header: not '%!'\n";
1036 close FAILLOG;
1037 }
1038 return 0;
1039 }
1040
1041 # if ps has Page data, then use it to delete all stuff before it.
1042 $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
1043
1044 # remove all leading non-data stuff
1045 $text =~ s/^.*?\(//s;
1046
1047 # remove all newline chars for easier processing
1048 $text =~ s/\n//g;
1049
1050 # Big assumption here - assume that if any co-ordinates are
1051 # given, then we are at the end of a sentence.
1052 $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
1053
1054 # special characters--
1055 $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
1056
1057 # ? ps text formatting (eg italics?) ?
1058 $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
1059 $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
1060 $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
1061 # default - remove the rest
1062 $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
1063
1064 # attempt to add whitespace between words...
1065 # this is based purely on observation, and may be completely wrong...
1066 $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
1067 # eg I notice "b(" is sometimes NOT a space if preceded by a
1068 # negative number.
1069 $text =~ s/\)\d+ ?b\(/\) \( /g;
1070
1071 # change quoted braces to brackets
1072 $text =~ s/([^\\])\\\(/$1\{/g;
1073 $text =~ s/([^\\])\\\)/$1\}/g ;
1074
1075 # remove everything that is not between braces
1076 $text =~ s/\)([^\(\)])+?\(//sg ;
1077
1078 # remove any Trailer eof stuff.
1079 $text =~ s/\)[^\)]*$//sg;
1080
1081 ### ligatures have special characters...
1082 $text =~ s/\\013/ff/g;
1083 $text =~ s/\\014/fi/g;
1084 $text =~ s/\\015/fl/g;
1085 $text =~ s/\\016/ffi/g;
1086 $text =~ s/\\214/fi/g;
1087 $text =~ s/\\215/fl/g;
1088 $text =~ s/\\017/\n\* /g; # asterisk?
1089 $text =~ s/\\023/\023/g; # e acute ('e)
1090 $text =~ s/\\177/\252/g; # u"
1091# $text =~ s/ ?? /\344/g; # a"
1092
1093 print OUT "$text";
1094 close OUT;
1095 }
1096 # wrap the text - use a minimum length. ie, first space after this length.
1097 my $wrap_length=72;
1098 &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
1099 open INFILE, "$output_filestem.text.tmp" ||
1100 die "Couldn't open file: $!";
1101 open OUTFILE, ">$output_filestem.text" ||
1102 die "Couldn't open file for writing: $!";
1103 my $line="";
1104 while ($line=<INFILE>) {
1105 while (length($line)>0) {
1106 if (length($line)>$wrap_length) {
1107 $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
1108 print OUTFILE "$1\n";
1109 } else {
1110 print OUTFILE "$line";
1111 $line="";
1112 }
1113 }
1114 }
1115 close INFILE;
1116 close OUTFILE;
1117 &util::rm("$output_filestem.text.tmp");
1118
1119 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1120 return 1;
1121}
1122
1123
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: the input filename and the output filestem.  The text is
# first extracted into "<filestem>.text" by any_to_text(), then wrapped
# into "<filestem>.html"; the intermediate .text file is removed again.
# Returns 1 on success, 0 if no text could be extracted or if either
# file could not be opened.

sub any_to_html {
    # NOTE(review): assigned without "my", so these are not local to this
    # sub; left that way in case other code relies on it.
    ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file.  Three-argument open is used
    # so that odd characters in the names are never taken as an open mode,
    # and a failed open is reported instead of silently producing nothing.
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "Warning: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "Warning: couldn't write $html_file: $!\n";
        close($text_fh);
        # don't leave the intermediate file lying around
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # escape the HTML metacharacters; "&" has to go first so that the
        # entities produced for "<" and ">" are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            # a blank line starts a new paragraph
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close($html_fh);
    close($text_fh);

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1160
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments: the input filename and the output filestem; the extracted
# text is written to "<filestem>.text".
# Returns 1 if at least one chunk of text was written.  Returns 0 if
# $use_strings is false, if either file could not be opened, or if
# nothing worth keeping was found (the output file is then removed).

sub any_to_text {
    # NOTE(review): assigned without "my", so these are not local to this
    # sub; left that way in case other code relies on it.
    ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    # Three-argument open, so that odd characters in the names are never
    # taken as an open mode.
    my ($in_fh, $out_fh);
    open($in_fh, '<', $input_filename) || return 0;
    binmode($in_fh);
    if (!open($out_fh, '>', "$output_filestem.text")) {
        # don't leave the input handle open behind us
        close($in_fh);
        return 0;
    }

    my $output_line_count = 0;
    # The input is binary, so a "line" here is simply whatever lies
    # between two newline bytes.
    while (defined(my $line = <$in_fh>)) {

        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    close($out_fh);
    close($in_fh);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.