source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24375

Last change on this file since 24375 was 24375, checked in by ak19, 11 years ago

Added in verbosity option when launching wvware.pl, so that an unnecessary message can be suppressed at lower verbosity levels.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 35.0 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Refuse to start unless GSDLHOME is set, then put $GSDLHOME/perllib at the
# front of the module search path so modules stored there can be loaded.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?
# The eval traps the failure to load Win32 (e.g. on non-Windows platforms);
# in that case the eval yields undef, which is normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
if (!defined($is_winnt_2000)) { $is_winnt_2000 = 0; }

# Command-line switches; main() hands references to these to parsargv::parse.
my $use_strings;            # -use_strings
my $pdf_complex;            # -pdf_complex: adds "-c" to the pdftohtml.pl command
my $pdf_nohidden;           # -pdf_nohidden: suppresses "-hidden" for pdftohtml.pl
my $pdf_zoom;               # -pdf_zoom: value passed as "-zoom" to pdftohtml.pl
my $pdf_ignore_images;      # -pdf_ignore_images: adds "-i" to the pdftohtml.pl command
my $pdf_allow_images_only;  # -pdf_allow_images_only: adds "-a" to the pdftohtml.pl command
my $windows_scripting;      # -windows_scripting: use the VB scripting converters
68
# Print the command-line help on STDERR and terminate the script.
# Never returns: always exits with status 1.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from the help text
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # closing parenthesis was missing in the original message
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
90
# Settings shared by the converters below; main() overwrites them from the
# -errlog, -timeout and -verbose options.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
94
# Entry point: parse the command line, work out the input type and hand the
# file to the matching convertXXX routine.
#
# Arguments: the command-line argument list (normally @ARGV).
# Output:    one line on STDOUT holding the value returned by the converter
#            ("html", "text", "item" or "fail").
# Exits with status 1 (directly or via print_usage) on bad arguments, an
# unreadable input file, a missing input type or an unsupported input type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@args) {
        # Options arrive with a leading dash, so the dash must be allowed
        # here; the bare word is still accepted as before.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with a "/" unlike the others;
    # left untouched because parsargv is not visible here - confirm.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # "auto" means no preference. The converters only try HTML then TEXT
    # when the output type is empty, so a literal "auto" would make every
    # one of their html/text attempts be skipped.
    if (defined $output_type && $output_type =~ m/^auto/i) {
        $output_type = "";
    }

    # Exactly one file name must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames (File::Basename is loaded here because this file
    # does not load it itself)
    require File::Basename;
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the lower-cased file extension
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to the caller on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
202
203
204
205# Document-type conversion functions
206#
207# The following functions attempt to convert documents from their
208# input type to the specified output type. If no output type was
209# given, then they first attempt HTML, and then TEXT.
210#
211# Each returns the output type ("html", "text", or "item" for paged-image
212# and slide-extraction output) or "fail" if no conversion is possible.
213
# Convert a file that claims to be a Microsoft Word document.
#
# The content is sniffed first (find_docfile_type) because many .doc files
# are not in fact Word documents; the file is then routed to the Word, RTF
# or generic converter. Returns whatever the chosen converter returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    # RTF content hiding behind a .doc name
    return convertRTF($input_filename, $output_filestem, $output_type)
        if $detected eq "rtf";

    # Genuine Word content
    my %is_word = map { $_ => 1 } qw(word6 word7 word8 docx);
    return convertWord678($input_filename, $output_filestem, $output_type)
        if $is_word{$detected};

    # Anything else gets the generic treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
231
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# tries the scripting converter if -windows_scripting was given, otherwise
# the wv-based one. Returns "html" on success; otherwise defers to
# convertAnything and returns its result.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || $output_type =~ m/html/i);

    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
251
252
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (scripting converter
# with -windows_scripting, rtftohtml otherwise). Returns "html" on success
# and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || $output_type =~ m/html/i);

    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
279
280
# Convert an unidentified file.
#
# Tries the generic HTML converter and then the generic text converter,
# each only when the requested output type is empty or names that format.
# Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Simple conversion to HTML
    if ($no_preference || $output_type =~ m/html/i) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Conversion to plain text
    if ($no_preference || $output_type =~ m/text/i) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
305
306
307
# Convert an Adobe PDF document.
#
# The output type is first reduced to whatever follows its last hyphen (if
# it contains one). Conversion is then attempted, in order, to images (when
# the type mentions jpg/jg, gif or png), to HTML and to text; the latter two
# only when the type is empty or names that format.
# Returns "item" (images), "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.*\-(.*)/$1/i;

    # Paged images
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    my $no_preference = !$output_type;

    # HTML
    if ($no_preference || $output_type =~ m/html/i) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Plain text
    if ($no_preference || $output_type =~ m/text/i) {
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
342
343
# Convert an Adobe PostScript document.
#
# The output type is first reduced to whatever follows its last hyphen (if
# it contains one). Images are attempted when the type mentions jpg/jg, gif
# or png; text is attempted when the type is empty or mentions "text".
# Returns "item" (images), "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.*\-(.*)/$1/i;

    # Paged images
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Plain text
    if (!$output_type || $output_type =~ m/text/i) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
368
369
# Convert a PowerPoint presentation.
#
# With -windows_scripting and an output type naming neither html nor text,
# the pptextract tool is run to write slides into a directory named after
# the output filestem ("item" is returned, also when that directory already
# exists). Otherwise, when HTML is acceptable, ppttohtml.pl is run ("html").
# If the chosen converter fails, or none applies, the generic text
# extraction is used. Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $use_vb_extract = ($windows_scripting
        && !($output_type =~ m/html/i || $output_type =~ m/text/i));

    if ($use_vb_extract) {
        # Flag telling pptextract which image format to produce
        my $ppt_convert_type = "";
        if    ($output_type =~ m/gif/i)  { $ppt_convert_type = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $ppt_convert_type = "-j"; }
        elsif ($output_type =~ m/png/i)  { $ppt_convert_type = "-p"; }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if (!($ENV{'GSDLOS'} =~ m/^windows$/i) || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || $output_type =~ m/html/i) {
        # formulate the command
        my $perl_exec = util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if (!($ENV{'GSDLOS'} =~ m/^windows$/i) || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
433
434
# Convert an Excel spreadsheet.
#
# When HTML is acceptable (output type empty or naming html), xlstohtml.pl
# is run and "html" is returned on a zero exit status. In every other case
# the generic text extraction is tried. Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || $output_type =~ m/html/i) {
        # formulate the command
        my $perl_exec = util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if (!($ENV{'GSDLOS'} =~ m/^windows$/i) || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back to generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
468
469
470
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns:  "docx"    - name ends in .docx (any case) and -windows_scripting is on
#           "rtf"     - the first line starts with "{\rtf"
#           "word6", "word7" or "word8"
#                     - a line contains "Word.Document.6", ".7" or ".8"
#           "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # Case-insensitive, to agree with the docx test in native_doc_to_html
    if ($windows_scripting && $input_filename =~ m/\.docx$/i) {
        return "docx";
    }

    # Three-argument open: the file name is never interpreted as a mode
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # Single exit point, so the handle is closed on every path
    close($chk);
    return $realtype;
}
511
512
513# Specific type-to-type conversions
514#
515# Each of the following functions attempts to convert a document from
516# a specific format to another. If they succeed they return 1 and leave
517# the output document(s) in the appropriate place; if they fail they
518# return 0 and delete any working files.
519
520
# Attempt to convert a word document to html by launching wvware.pl
#
# Arguments: input file name and output filestem.
# Returns:   the value of system() divided by 256, i.e. the exit code of
#            the launched command; callers treat a true value as success.
#            NOTE(review): this presumes wvware.pl exits non-zero on
#            success - confirm against wvware.pl.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # The path to perl is quoted in case there are spaces in it. The file
    # names are quoted for the same reason, and so that an empty fail log
    # name is still passed as an (empty) argument instead of vanishing and
    # shifting the verbosity and timeout arguments one place to the left.
    my $launch_cmd = "\"" . &util::get_perl_exec() . "\" -S wvware.pl"
        . " \"$input_filename\" \"$output_filestem\" \"$faillogfile\""
        . " $verbosity $timeout";

    #    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $wvware_status = system($launch_cmd) / 256;
    return $wvware_status;
}
535
# Attempt to convert a word document to html with the word2html scripting
# program (or docx2html.vbs for docx input on windows)
#
# Arguments: input file name and output filestem.
# Returns:   1 if "<filestem>.html" already existed or was produced and its
#            first line mentions "html"; 0 otherwise. On failure the
#            converter's error output is copied to the fail log when one is
#            configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions: use the usual VB executable word2html.
            # Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &util::filename_cat($vbScript, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # $faillog stays undefined unless a fail log is configured and
            # could be opened for appending
            my $faillog;
            if ($faillogfile ne "" && !open($faillog, '>>', $faillogfile)) {
                $faillog = undef;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if $faillog;
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if $faillog;
                }
                # this handle used to be left open
                close($errfile);
            }
            close($faillog) if $faillog;
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments: input file name and output filestem.
# Returns:   1 when "<filestem>.html" holds some word characters after its
#            <body> line (a "<filestem>_ToC.html" file, if present, is
#            merged into it); 0 otherwise, after removing the output and
#            copying the converter's error output to the fail log when one
#            is configured.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful? Make sure we have some content other
    # than the header; an unreadable file counts as "no content".
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }
            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # Each result is tested on its own. The old form
            # 'open FH, "file" || ++$open_failed' bound "||" to the file
            # name string, so a failed open was never noticed.
            my $src_ok = open(my $htmlsrc, '<', "$output_filestem.src");
            my $toc_ok = open(my $toc, '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $html_out, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the table of contents, keep the plain html
                close($htmlsrc)  if $src_ok;
                close($toc)      if $toc_ok;
                close($html_out) if $out_ok;
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: every line up to the
            # first one without a word character (which is dropped)
            while (defined(my $line = <$htmlsrc>)) {
                last unless ($line =~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            scalar(<$toc>) for (1 .. 2);   # ignore first 2 lines
            my $list_start = <$toc>;       # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
753
754
# Convert a pdf file to html by running pdftohtml.pl
#
# Arguments: directory name (not used here), input file name, output filestem.
# Returns:   1 when the command exits with status 0 and leaves a non-empty
#            "<filestem>.html"; 0 otherwise, after echoing the converter's
#            error output on STDERR and appending it to the fail log when
#            one is configured. The .out and .err work files are removed
#            either way.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $perl_exec = util::get_perl_exec();
    $cmd .= "\"$perl_exec\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Where the two output streams go depends on the platform
    if (!($ENV{'GSDLOS'} =~ m/^windows$/i) || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if $!;
        print STDERR "\n";
    }

    # Success: the converter ran cleanly and made something
    if ($retval == 0 && -s $html_file) {
        util::rm($err_file) if (-e $err_file);
        util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure: report, log and tidy up
    util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR "$err_line";
        }
        close ERRLOG;
    }

    print STDERR "***********output filestem $output_filestem.html\n";
    util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            while (my $err_line = <ERRLOG>) { print FAILLOG $err_line; }
            close ERRLOG;
            close FAILLOG;
        }
        util::rm($err_file);
    }

    return 0;
}
818
# Convert a pdf (or postscript) file to various types of image by running
# pdfpstoimg.pl
#
# Arguments: directory name (not used here), input file name, output
#            filestem, and the output type; whatever follows the last
#            underscore of the type is passed as -convert_to.
# Returns:   1 when the command exits with status 0; 0 when ImageMagick's
#            "identify" cannot be run or the conversion command fails (its
#            error output is echoed on STDERR and appended to the fail log
#            when one is configured).

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Same (case-insensitive) platform test as the rest of this file
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i);

    # Check that ImageMagick is installed and available on the path (except
    # for Windows 95/98). The cached $is_winnt_2000 flag is used instead of
    # calling Win32::IsWinNT() again, so a missing Win32 module cannot
    # abort the script here.
    if (!$on_windows || $is_winnt_2000) {
        my $probe_output = `identify 2>&1`;
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if (!$on_windows || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval == 0) {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # The message now names the script that is actually launched
    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) { print STDERR ": $!"; }
    print STDERR "\n";

    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $errlog, '<', "$output_filestem.err") || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
888
# Convert a PDF file to text with the pdftotext command
#
# Arguments: directory name (not used here), input file name, output filestem.
# Returns:   1 when "<filestem>.text" exists, is non-empty and contains at
#            least one word character; 0 otherwise, after echoing the
#            converter's error output on STDERR and appending it to the
#            fail log when one is configured.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $cmd .= " > \"$err_file\"";
    }
    else {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...
        my $seen_text = 0;
        while (!$seen_text) {
            my $line = <EXTR_TEXT>;
            last unless $line;
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close EXTR_TEXT;
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            util::rm($text_file);
        }
    }

    # The converter made something usable
    if (-s $text_file) {
        util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR "$err_line";
        }
        close ERRLOG;
    }

    # does this converter create a .out file?
    util::rm($out_file) if (-e $out_file);
    util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG,"$output_filestem.err");
            while (my $err_line = <ERRLOG>) { print FAILLOG $err_line; }
            close ERRLOG;
            close FAILLOG;
        }
        util::rm($err_file);
    }

    return 0;
}
954
# Convert a PostScript document to text.
#
# Note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so we run "gs" directly here.
#
# If gs cannot be used (Windows), fails, or reports an interpreter error,
# we fall back to a crude regexp-based extraction of the strings found in
# the PostScript source. Either way the resulting text is re-wrapped.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Uses the file-level settings $timeout (CPU limit handed to "ulimit -t")
# and $faillogfile (failure details are appended to it when it is set).
#
# Returns 1 on success and 0 on failure. This sub never dies.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error     = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        # quote the stderr target as well, so paths with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Test the whole wait status, not just the exit code in the high
        # byte: a gs killed by a signal (eg the ulimit CPU limit) can leave
        # the high byte at 0. See man perlfunc - system for this...
        my $status = $?;

        if ($status != 0) {
            # if system couldn't start the program, $! may hold the reason
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (!-e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_out, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is
            # technically possible for the actual text to start with
            # this, but....
            my $first_line = <$ps_out>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $ps_out;
        }
        else {
            $error = "couldn't read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if (defined $faillogfile && $faillogfile ne ""
            && open(my $faillog, '>>', $faillogfile))
        {
            print {$faillog} "gs - $error\n";
            # copy whatever gs wrote to stderr into the fail log
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # slurp the whole .ps file - see man perlport, under
        # "System Resources"
        my $text = do { local $/; <$ps_in> };
        close $ps_in;
        $text = "" unless defined $text;

        # Make sure this is a ps file... (checked before the output file
        # is created, so a rejected input leaves no empty .text behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if (defined $faillogfile && $faillogfile ne ""
                && open(my $faillog, '>>', $faillogfile))
            {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $stripped_out;
        if (!open($stripped_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$stripped_out} _ps_strip_text($text);
        close $stripped_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file    = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);

    my ($wrap_in, $wrap_out);
    if (!open($wrap_in, '<', $tmp_file)) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    if (!open($wrap_out, '>', $text_file)) {
        print STDERR "Couldn't open file for writing: $!\n";
        close $wrap_in;
        &util::rm($tmp_file) if (-e $tmp_file);
        return 0;
    }

    while (my $line = <$wrap_in>) {
        # peel off chunks of at least $wrap_length characters, each
        # extended to the end of the word it finishes in
        while (length($line) > $wrap_length) {
            # the "last" is only a safety net against looping forever
            last unless ($line =~ s/^(.{$wrap_length}[^\s]*)\s*//);
            print {$wrap_out} "$1\n";
        }
        print {$wrap_out} $line;
    }
    close $wrap_in;
    close $wrap_out;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}

# Crude extraction of the text strings from raw PostScript source.
# Takes the whole .ps file as one string and returns the extracted text.
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html

sub _ps_strip_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
#   $text =~ s/ ?? /\344/g; # a"

    return $text;
}
1124
1125
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text() and then wrapped in a
# minimal HTML page: blank lines become "<p>", every other line is
# prefixed with "<br> ". The intermediate "$output_filestem.text" file is
# removed afterwards.
#
# Arguments:
#   $input_filename  - the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# Returns 1 on success, 0 if no text could be extracted or if the
# intermediate/output files could not be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # '&' must be escaped first, otherwise the entities produced for
        # '<' and '>' below would themselves be re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1162
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Runs of printable ASCII characters that are at least 10 characters long
# are kept, one per line; everything else is dropped.
#
# Arguments:
#   $input_filename  - the file to extract strings from (read in binary mode)
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Returns 1 if any text was written. Returns 0 (leaving no .text file
# behind) if the file-level $use_strings setting is off, a file could not
# be opened, or nothing printable was found.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    # three-argument open: the filename is never interpreted as a mode
    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);

    my $out_fh;
    if (!open($out_fh, '>', $text_file)) {
        # don't leave the input handle open on the failure path
        close $in_fh;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in_fh>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out_fh} $line;
            ++$output_line_count;
        }
    }

    close $out_fh;
    close $in_fh;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}
Note: See TracBrowser for help on using the repository browser.