source: trunk/gsdl/bin/script/gsConvert.pl@ 11348

Last change on this file since 11348 was 10534, checked in by chi, 19 years ago

Adding pagedimg types of conversion for PS documents. The convert utility of ImageMagick is used to convert PS documents to different types of image (JPEG, GIF, PNG).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time setup: refuse to run without GSDLHOME, and put the Greenstone
# perllib directory at the front of the module search path so that the
# project modules loaded below can be found.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$gsdl_home/perllib");
}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module can only be loaded on Windows; if the require fails the
# eval yields undef and the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Option values filled in by the command-line parser in main()
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
67
# Print the usage summary on STDERR and terminate the script with exit
# status 1.  Never returns.
sub print_usage
{
    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR " or text using third-party programs.\n\n";
    print STDERR " usage: $0 [options] filename\n";
    print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    # the closing parenthesis was missing from this message
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
89
# $faillogfile: file that converter error output is appended to ("" = none)
# $timeout:     CPU-seconds limit passed to "ulimit -t" (0 = no limit)
my ($faillogfile, $timeout) = ("", 0);
92
# Script entry point.
#
# Parses the command line (passed in as @_), determines the input type
# (from -type or, failing that, from the filename extension), changes into
# the directory of the input file and dispatches to the matching convert*
# routine.  The routine's result string is printed on STDOUT followed by a
# newline.  Exits with status 1 on a usage error, an unreadable input file
# or an unsupported input type.
sub main
{
    my (@args) = @_;
    # $verbose is accepted on the command line but not otherwise used here
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting',\$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back on the extension, minus its leading dot.
    # $input_type stays undefined when there is no extension either.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : undef;
    }

    # "auto" is advertised in the usage text, but the convert* routines only
    # recognise an empty output type as "no preference"; map one to the other
    # so that -output auto does not fall through every branch and fail.
    if (!defined $output_type || $output_type eq "auto") {
        $output_type = "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname: $!";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir: $!";
}
181
# Run the conversion with the command-line arguments given to the script
main(@ARGV);
183
184
185
186# Document-type conversion functions
187#
188# The following functions attempt to convert documents from their
189# input type to the specified output type. If no output type was
190# given, then they first attempt HTML, and then TEXT.
191#
192# Each returns the output type ("html", "text", or "item" for paged
193# images) or "fail" if no conversion is possible.
194
# Convert a Microsoft word document
#
# Arguments: input filename, output file stem (output path without
# extension), requested output type (empty for "no preference").
# Returns the result string of whichever converter was chosen.

sub convertDOC {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    } elsif ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    } else {
        return &convertAnything($input_filename, $output_filestem, $output_type);
    }
}
211
# Convert a Microsoft word 6/7/8 document
#
# Tries HTML first (through the VB script when -windows_scripting was given,
# otherwise through wvWare) when HTML is requested or no type is requested.
# Returns "html" on success; otherwise returns whatever convertAnything
# returns.

sub convertWord678 {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;
    if (!$output_type || ($output_type =~ /html/i)) {
        if ($windows_scripting) {
            $success = &native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $success = &doc_to_html($input_filename, $output_filestem);
        }
        if ($success) {
            return "html";
        }
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
233
# Convert a Rich Text Format (RTF) file
#
# Only conversion to HTML is attempted.  Returns "html" on success and
# "fail" otherwise (including when an output type other than HTML was
# requested).

sub convertRTF {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        $success = &rtf_to_html($input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
254
255
# Convert an unidentified file
#
# Tries the generic HTML converter, then the generic text converter, each
# only if that type was requested or no type was requested.  Returns
# "html", "text" or "fail".

sub convertAnything {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        $success = &any_to_html($input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        $success = &any_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
280
281
282
# Convert an Adobe PDF document
#
# Arguments: directory of the input file, input filename, output file stem
# and requested output type.  Returns "item" (paged images), "html", "text"
# or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last hyphen, e.g. "pagedimg-jpg" -> "jpg"
    $output_type =~ s/.*\-(.*)/$1/i;

    # Paged images, when an image format was asked for
    if ($output_type =~ /jp?g|gif|png/i) {
        return "item"
            if &pdf_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # An empty output type means every remaining format may be tried
    my $no_preference = !$output_type;

    # HTML first ...
    if ($no_preference || ($output_type =~ /html/i)) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # ... then plain text
    if ($no_preference || ($output_type =~ /text/i)) {
        return "text"
            if &pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
317
318
# Convert an Adobe PostScript document
#
# Arguments: input filename, output file stem and requested output type.
# Returns "item" (paged images), "text" or "fail".

sub convertPS {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # ps_to_img is called with a directory as its first argument, but this
    # routine is not given one and $dirname was never set here.  Derive it
    # from the input filename with the same fileparse() call main() uses.
    # NOTE(review): ps_to_img is defined outside this view - confirm it
    # expects the directory of the input file.
    my $dirname = (File::Basename::fileparse($input_filename, "\\.[^\\.]+\$"))[1];

    my $success = 0;
    # Keep only what follows the last hyphen, e.g. "pagedimg-jpg" -> "jpg"
    $output_type =~ s/.*\-(.*)/$1/i;
    # Attempt conversion to Image
    if ($output_type =~ /jp?g|gif|png/i) {
        $success = &ps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        $success = &ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
343
344
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither HTML nor text,
# the pptextract script is run to produce paged images ("item").  Otherwise,
# if HTML is requested (or nothing is requested), ppttohtml.pl is run.
# If the chosen converter fails, text extraction via any_to_text is the
# last resort.  Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;
    my $success = 0;

    # avoid "uninitialized" warnings in the pattern matches below
    $output_type = "" unless defined $output_type;

    my $ppt_convert_type = "";
    if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)) {
        # image format switch handed to pptextract
        if ($output_type =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ /png/i) {
            $ppt_convert_type = "-p";
        }
        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        # on windows rely on the PATH rather than a full path
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # $cmd is now lexical; it used to be a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        } else {
            $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
            $cmd .= " 2>\"$output_filestem.err\""
                if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);
            if (system($cmd) != 0) {
                print STDERR "Powerpoint VB Scripting convert failed\n";
            } else {
                return "item";
            }
        }
    } elsif (!$output_type || ($output_type =~ /html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0)
        {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Fall back on plain text extraction
    $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
406
407
# Convert a Microsoft Excel document
#
# Runs xlstohtml.pl when HTML is requested (or nothing is requested) and
# falls back on text extraction via any_to_text.  Returns "html", "text"
# or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is now lexical; it used to be a
        # package global)
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0)
        {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Fall back on plain text extraction
    $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
440
441
442
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the name of the file to inspect.
# Returns "rtf" if the first line starts with "{\rtf", "word6"/"word7"/
# "word8" if a "Word.Document.N" marker is found on any line, and
# "unknown" otherwise (also when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open on a lexical handle: the filename is never
    # interpreted as a mode, and the handle cannot clash with other code.
    my $chk_fh;
    if (!open($chk_fh, '<', $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk_fh);

    my $doctype = "unknown";
    my $first = 1;

    while (my $line = <$chk_fh>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk_fh);
    return $doctype;
}
478
479
480# Specific type-to-type conversions
481#
482# Each of the following functions attempts to convert a document from
483# a specific format to another. If they succeed they return 1 and leave
484# the output document(s) in the appropriate place; if they fail they
485# return 0 and delete any working files.
486
487
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename and output file stem.  Writes
# "$output_filestem.html"; converter messages go to "$output_filestem.err"
# and are copied to the fail log when one is configured.
# Returns 1 if wvWare ran successfully and the first line of its output
# contains "DOCTYPE HTML", 0 otherwise.
sub doc_to_html {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:$!\n";
        my $err_fh;
        if (-s "$output_filestem.err" && open($err_fh, '<', "$output_filestem.err")) {
            # open() may leave a handle in $fail_fh even on failure, so a
            # separate flag records whether the fail log is writable
            my $fail_fh;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($fail_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$err_fh>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$fail_fh} "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print {$fail_fh} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($fail_fh) if ($write_to_fail_log);
            close($err_fh);    # this handle used to be left open
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
566
567
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: input filename and output file stem.  If
# "$output_filestem.html" already exists it is taken as the result and 1 is
# returned without running anything.  Otherwise returns 1 if the script ran
# successfully and the first line of its output contains "html", and
# 0 otherwise.
sub native_doc_to_html {
    # lexical copies: the originals were assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows rely on the PATH rather than a full path
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        my $err_fh;
        if (-s "$output_filestem.err" && open($err_fh, '<', "$output_filestem.err")) {
            # open() may leave a handle in $fail_fh even on failure, so a
            # separate flag records whether the fail log is writable
            my $fail_fh;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($fail_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$err_fh>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$fail_fh} "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print {$fail_fh} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($fail_fh) if ($write_to_fail_log);
            close($err_fh);    # this handle used to be left open
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ /html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
644
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments: input filename and output file stem.  The conversion counts as
# successful when the generated HTML has at least one word character after
# the <body> line once tags are stripped.  If rtftohtml also produced a
# "<stem>_ToC.html" file, its entries are spliced into the output after the
# header, with their links made relative.  Returns 1 on success; on failure
# the converter's messages are appended to the fail log (if configured),
# the working files are removed and 0 is returned.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command ($cmd is now lexical; it used to be a package
    # global)
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    my $html_in;
    if (-s "$output_filestem.html" && open($html_in, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {

            if (!$past_header) {
                if ($line =~ /<body>/) {$past_header = 1;}
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # The original wrote  open FH, "file" || ++$open_failed;  where
            # || binds to the filename string, so a failed open was never
            # noticed.  Check the result of each open() itself instead; the
            # chain stops at the first failure, so the output file is only
            # created once both inputs are readable.
            my ($src_fh, $toc_fh, $out_fh);
            my $all_opened = open($src_fh, '<', "$output_filestem.src")
                && open($toc_fh, '<', "${output_filestem}_ToC.html")
                && open($out_fh, '>', "$output_filestem.html");

            if (!$all_opened) {
                foreach my $fh ($src_fh, $toc_fh, $out_fh) {
                    close($fh) if (defined $fh && defined fileno($fh));
                }
                # put the converter's output back and go without the ToC
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $src_line = <$src_fh>)) {
                last if ($src_line !~ /\w/);
                print {$out_fh} $src_line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc_fh>; } # ignore first 2 lines
            print {$out_fh} scalar(<$toc_fh>); # line 3 = "<ol>\n"
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@;
                print {$out_fh} $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (my $src_line = <$src_fh>) {
                print {$out_fh} $src_line;
            }
            close($src_fh);
            close($out_fh);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile))
        {
            print {$fail_fh} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$fail_fh} " (rtf file might be too recent):\n";
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                close($err_fh);
            }
            close($fail_fh);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
762
763
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the input file (not used here), input filename
# and output file stem.  Runs pdftohtml.pl with the -pdf_* options given on
# the command line.  Returns 1 if the script succeeded and produced a
# non-empty "$output_filestem.html"; otherwise reports the converter's
# error output, removes the working files and returns 0.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # $cmd is now lexical; it used to be a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # without STDERR redirection (windows 95/98) STDOUT is what ends up
    # in the .err file
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html")
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_fh, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$err_fh>) {
                print STDERR "$line";
            }
            close($err_fh);
        }
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile))
            {
                if (open(my $err_fh, '<', "$output_filestem.err")) {
                    while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                    close($err_fh);
                }
                close($fail_fh);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
825
# Convert a pdf file to various types of image with the convert command
#
# Arguments: directory of the input file (not used here), input filename,
# output file stem and image type.  Runs pdftoimg.pl.  Returns 1 if the
# script exits successfully; returns 0 if ImageMagick's "identify" cannot
# be run or the script fails (its error output is then reported and the
# working files removed).

sub pdf_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    # Uses the $is_winnt_2000 flag computed at start-up rather than calling
    # Win32::IsWinNT() directly, and the same case-insensitive GSDLOS test
    # as the rest of the file.
    if (!($ENV{'GSDLOS'} =~ /^windows$/i && !$is_winnt_2000)) {
        my $result = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    # $cmd is now lexical; it used to be a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # Keep only what follows the last underscore, e.g. "pagedimg_jpg" -> "jpg"
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing pdftoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # only the exit status is checked; the output itself is not inspected
    if ($retval != 0)
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        #print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_fh, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftoimg error log:\n";
            while (my $line = <$err_fh>) {
                print STDERR "$line";
            }
            close($err_fh);
        }
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile))
            {
                if (open(my $err_fh, '<', "$output_filestem.err")) {
                    while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                    close($err_fh);
                }
                close($fail_fh);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
894
# Convert a PDF file to text with the pdftotext command
#
# Arguments: directory of the input file (not used here), input filename
# and output file stem.  Returns 1 if "$output_filestem.text" was produced
# and contains at least one word character; otherwise reports the
# converter's error output, removes the working files and returns 0.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $text_fh, '<', "$output_filestem.text")) {
            binmode($text_fh); # just in case...
            while (my $line = <$text_fh>) {
                if ($line =~ /\w/) {$seen_text = 1; last;}
            }
            close($text_fh);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_fh, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$err_fh>) {
                print STDERR "$line";
            }
            close($err_fh);
        }
        # the shell redirection above creates the .out file on non-windows
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile))
            {
                if (open(my $err_fh, '<', "$output_filestem.err")) {
                    while (my $line = <$err_fh>) {print {$fail_fh} $line;}
                    close($err_fh);
                }
                close($fail_fh);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    # the .out file was previously left behind after a successful run
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
960
961# Convert a PostScript document to text
962# note - just using "ps2ascii" isn't good enough, as it
963# returns 0 for a postscript interpreter error. ps2ascii is just
964# a wrapper to "gs" anyway, so we use that cmd here.
965
966sub ps_to_text {
967 my ($input_filename, $output_filestem) = @_;
968
969 my $error = "";
970
971 # if we're on windows we'll fall straight through without attempting
972 # to use gs
973 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
974 $error = "Windows does not support gs";
975
976 } else {
977 my $cmd = "";
978 if ($timeout) {$cmd = "ulimit -t $timeout; ";}
979 $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
980 $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
981 #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
982 $cmd .= " 2> $output_filestem.err";
983 $!=0;
984
985 my $retcode=system($cmd);
986 $retcode = $? >> 8; # see man perlfunc - system for this...
987 # if system returns -1 | 127 (couldn't start program), look at $! for message
988
989 if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
990 elsif (! -e "$output_filestem.text") {
991 $error="did not create output file.\n";
992 }
993 else
994 { # make sure the interpreter didn't get an error. It is technically
995 # possible for the actual text to start with this, but....
996 open PSOUT, "$output_filestem.text";
997 if (<PSOUT> =~ /^Error: (.*)/) {
998 $error="interpreter error - \"$1\"";
999 }
1000 close PSOUT;
1001 }
1002 }
1003
1004 if ($error ne "")
1005 {
1006 print STDERR "Warning: Error executing gs: $error\n";
1007 &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1008
1009 if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
1010 {
1011 print FAILLOG "gs - $error\n";
1012 if (-e "$output_filestem.err") {
1013 open(ERRLOG, "$output_filestem.err");
1014 while (<ERRLOG>) {print FAILLOG $_;}
1015 close ERRLOG;
1016 }
1017 close FAILLOG;
1018 }
1019 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1020
1021
1022 # Fine then. We'll just do a lousy job by ourselves...
1023 # Based on 5-line regexp sed script found at:
1024 # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
1025 #
1026 print STDERR "Stripping text from postscript\n";
1027 my $errorcode=0;
1028 open (IN, "$input_filename")
1029 || ($errorcode=1, warn "Couldn't read file: $!");
1030 open (OUT, ">$output_filestem.text")
1031 || ($errorcode=1, warn "Couldn't write file: $!");
1032 if ($errorcode) {print STDERR "errors\n";return 0;}
1033
1034 my $text=""; # this is for whole .ps file...
1035 $text = join('', <IN>); # see man perlport, under "System Resources"
1036 close IN;
1037
1038 # Make sure this is a ps file...
1039 if ($text !~ /^%!/) {
1040 print STDERR "Bad postscript header: not '%!'\n";
1041 if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
1042 {
1043 print FAILLOG "Bad postscript header: not '%!'\n";
1044 close FAILLOG;
1045 }
1046 return 0;
1047 }
1048
1049 # if ps has Page data, then use it to delete all stuff before it.
1050 $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
1051
1052 # remove all leading non-data stuff
1053 $text =~ s/^.*?\(//s;
1054
1055 # remove all newline chars for easier processing
1056 $text =~ s/\n//g;
1057
1058 # Big assumption here - assume that if any co-ordinates are
1059 # given, then we are at the end of a sentence.
1060 $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
1061
1062 # special characters--
1063 $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
1064
1065 # ? ps text formatting (eg italics?) ?
1066 $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
1067 $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
1068 $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
1069 # default - remove the rest
1070 $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
1071
1072 # attempt to add whitespace between words...
1073 # this is based purely on observation, and may be completely wrong...
1074 $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
1075 # eg I notice "b(" is sometimes NOT a space if preceded by a
1076 # negative number.
1077 $text =~ s/\)\d+ ?b\(/\) \( /g;
1078
1079 # change quoted braces to brackets
1080 $text =~ s/([^\\])\\\(/$1\{/g;
1081 $text =~ s/([^\\])\\\)/$1\}/g ;
1082
1083 # remove everything that is not between braces
1084 $text =~ s/\)([^\(\)])+?\(//sg ;
1085
1086 # remove any Trailer eof stuff.
1087 $text =~ s/\)[^\)]*$//sg;
1088
1089 ### ligatures have special characters...
1090 $text =~ s/\\013/ff/g;
1091 $text =~ s/\\014/fi/g;
1092 $text =~ s/\\015/fl/g;
1093 $text =~ s/\\016/ffi/g;
1094 $text =~ s/\\214/fi/g;
1095 $text =~ s/\\215/fl/g;
1096 $text =~ s/\\017/\n\* /g; # asterisk?
1097 $text =~ s/\\023/\023/g; # e acute ('e)
1098 $text =~ s/\\177/\252/g; # u"
1099# $text =~ s/ ?? /\344/g; # a"
1100
1101 print OUT "$text";
1102 close OUT;
1103 }
1104 # wrap the text - use a minimum length. ie, first space after this length.
1105 my $wrap_length=72;
1106 &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
1107 open INFILE, "$output_filestem.text.tmp" ||
1108 die "Couldn't open file: $!";
1109 open OUTFILE, ">$output_filestem.text" ||
1110 die "Couldn't open file for writing: $!";
1111 my $line="";
1112 while ($line=<INFILE>) {
1113 while (length($line)>0) {
1114 if (length($line)>$wrap_length) {
1115 $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
1116 print OUTFILE "$1\n";
1117 } else {
1118 print OUTFILE "$line";
1119 $line="";
1120 }
1121 }
1122 }
1123 close INFILE;
1124 close OUTFILE;
1125 &util::rm("$output_filestem.text.tmp");
1126
1127 &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1128 return 1;
1129}
1130
1131
# Convert a PostScript file to page images by running the pstoimg.pl
# helper script (which relies on ImageMagick being installed).
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; also used as the stem
#                      of the temporary ".out" and ".err" capture files
#   $output_type     - requested type; everything up to and including the
#                      last underscore is stripped before it is handed to
#                      pstoimg.pl (a value such as "pagedimg_jpg" becomes "jpg")
#
# Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
#
# Returns 1 on success; 0 if ImageMagick cannot be found or the conversion
# command reports failure.
sub ps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (the probe is skipped on Windows 95/98).
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $identify_output = `identify 2>&1`;
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Keep the command string local so it cannot leak into (or pick up
    # leftovers from) other converters.
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    # NOTE(review): the file names are interpolated into a shell command;
    # names containing double quotes or shell metacharacters are not escaped.
    $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # only stdout is redirected here, and it goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval == 0) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # $! only describes the failure when system() itself returned -1;
    # otherwise report how the child process ended.
    print STDERR "Error executing pstoimg.pl";
    if ($retval == -1) {
        print STDERR ": $!";
    } elsif ($retval & 127) {
        print STDERR ": killed by signal " . ($retval & 127);
    } else {
        print STDERR ": exit status " . ($retval >> 8);
    }
    print STDERR "\n";

    &util::rm($out_file) if (-e $out_file);

    if (-e $err_file) {
        # Read the converter's captured output once, then report it on
        # STDERR and append it to the fail log (when one is configured).
        my @err_lines = ();
        if (open(my $errlog, '<', $err_file)) {
            @err_lines = <$errlog>;
            close($errlog);
        } else {
            print STDERR "Couldn't read $err_file: $!\n";
        }

        if (@err_lines) {
            print STDERR "pstoimg error log:\n";
            print STDERR join('', @err_lines);
        }

        if ($faillogfile ne "") {
            if (open(my $faillog, '>>', $faillogfile)) {
                print $faillog join('', @err_lines);
                close($faillog);
            }
        }
        &util::rm($err_file);
    }
    return 0;
}
1199
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to extract text from
#   $output_filestem - output path without extension; "$output_filestem.html"
#                      is written, and "$output_filestem.text" is used as an
#                      intermediate file that is removed again afterwards
#
# Returns 1 if the HTML file was written, 0 if any_to_text() produced
# nothing or one of the files could not be opened or written.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close($text_fh);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # Escape '&' first so that the entities produced for '<' and '>'
        # below are not themselves re-escaped.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            # a blank line becomes a paragraph break
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close($text_fh);
    # Buffered write errors only show up when the handle is closed.
    my $html_written = close($html_fh);

    &util::rm($text_file) if (-e $text_file);

    if (!$html_written) {
        print STDERR "any_to_html: error writing $html_file: $!\n";
        return 0;
    }
    return 1;
}
1236
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan (read in binary mode)
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Does nothing and returns 0 unless the file-level $use_strings flag is set.
#
# Returns 1 if at least one chunk of text was written; otherwise 0 (in
# which case the output file is removed again, unless it could not even be
# opened).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 if (!$use_strings);

    my $text_file = "$output_filestem.text";

    # Three-argument open: the file name is taken literally, so names with
    # leading/trailing whitespace or mode characters are handled correctly.
    my ($in_fh, $out_fh);
    open($in_fh, '<', $input_filename) || return 0;
    binmode($in_fh);
    if (!open($out_fh, '>', $text_file)) {
        close($in_fh);   # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in_fh>) {

        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    # Buffered write errors only show up when the handle is closed.
    my $written_ok = close($out_fh);
    close($in_fh);

    if ($written_ok && $output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}
Note: See TracBrowser for help on using the repository browser.