source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24371

Last change on this file since 24371 was 24371, checked in by ak19, 11 years ago

Ticket 779: the new wvware.pl script sets the environment for what wvware needs, by setting the LD_LIB_PATH to gnome-lib-minimal in the extension folder, if this exists. wvware.pl is called by gsConvert to run wvware (also checked with the replace src doc with html menu option on rightclick) and the perl script can be launched from the command prompt to do the conversion as well.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.9 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time setup: refuse to run without a Greenstone environment and put
# the Greenstone perl library directory at the front of the module search
# path, so that the 'use' statements below can be resolved.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?  The eval yields undef when
# the Win32 module cannot be loaded, in which case the answer is "no" (0).
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() fills these in through parsargv::parse and
# the conversion routines below read them.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
68
# Print the command-line synopsis on STDERR and end the script with exit
# status 1.  This routine never returns to its caller.
sub print_usage
{
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $chunk (@usage_text) {
        print STDERR $chunk;
    }

    exit(1);
}
90
# Shared by main() and the converters: the file that error output is
# appended to ("" means none) and the CPU-seconds limit (0 means no limit).
my ($faillogfile, $timeout) = ("", 0);
93
# Entry point.
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the document's directory
# and hands over to the matching convertXXX routine.  Whatever that routine
# returns ("html", "text", "item" or "fail") is printed on STDOUT followed
# by a newline.
#
# Usage errors, an unreadable input file and an unknown or missing input
# type all end the script with exit status 1.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The raw arguments still carry their leading dash(es), so the option
    # has to be recognised with them; the bare word is accepted as before.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must be left over
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # "auto" means "no preference".  The converters express that as an
    # empty output type (try HTML first, then TEXT); the literal word
    # "auto" would match neither of their html/text tests.
    if (defined $output_type && $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Deduce filenames
    require File::Basename;
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back on the extension (without its dot)
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length $suffix) ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    # NOTE(review): $input_filename and $output_filestem are used as given
    # after this chdir, so a relative path would be resolved against the
    # document's own directory -- callers presumably pass absolute paths;
    # confirm.
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    my $result;
    if ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to whoever launched us
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}
197
# Run the conversion for the arguments given on the command line.
main(@ARGV);
199
200
201
202# Document-type conversion functions
203#
204# The following functions attempt to convert documents from their
205# input type to the specified output type. If no output type was
206# given, then they first attempt HTML, and then TEXT.
207#
# Each returns the output type ("html", "text", or "item" when a set of
# images/items was produced) or "fail" if no conversion is possible.
210
# Convert a file that claims to be a Microsoft Word document.
#
# The content is sniffed first, because many .doc files are not in fact
# Word documents; the file is then passed on to the Word, RTF or generic
# converter.  Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    # RTF in disguise
    return convertRTF($input_filename, $output_filestem, $output_type)
        if ($detected eq "rtf");

    # Genuine Word 6/7/8 or docx
    my %is_word = map { $_ => 1 } qw(word6 word7 word8 docx);
    return convertWord678($input_filename, $output_filestem, $output_type)
        if ($is_word{$detected});

    # Anything else gets the generic treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
228
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML is acceptable, the Windows scripting converter is tried if
# -windows_scripting was given, otherwise the wv-based one.  If that does
# not succeed, or HTML was not wanted, the generic converter takes over.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
248
249
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (through Windows
# scripting when -windows_scripting was given, otherwise rtftohtml).
# There is deliberately no generic fallback: rtf is so ugly that it is not
# worth running strings over it.  Returns "html" or "fail".
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    return "fail" unless ($html_wanted);

    my $converted = $windows_scripting
        ? native_doc_to_html($input_filename, $output_filestem)
        : rtf_to_html($input_filename, $output_filestem);

    return $converted ? "html" : "fail";
}
276
277
# Convert an unidentified file.
#
# Tries the generic HTML conversion and then the generic text conversion,
# skipping whichever one the requested output type rules out.  Returns
# "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
302
303
304
# Convert an Adobe PDF document.
#
# Depending on the requested output type the file is turned into page
# images ("item"), HTML or text; with no output type given, HTML is tried
# before text.  Returns "item", "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last '-' in the requested type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to images
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html"
            if (pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if (pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
339
340
# Convert an Adobe PostScript document.
#
# Page images are produced when an image output type was requested,
# otherwise (or if that fails and text is acceptable) text is extracted.
# Returns "item", "text" or "fail".
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last '-' in the requested type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to images
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
365
366
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract program is run to produce one item per slide
# ("item").  Otherwise, when HTML is acceptable, ppttohtml.pl is run
# ("html").  If the chosen converter fails, text is extracted with the
# generic any_to_text ("text"); "fail" is returned when that fails too.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Pick the pptextract switch for the requested image format
        my $image_flag = ($output_type =~ m/gif/i)  ? "-g"
                       : ($output_type =~ m/jp?g/i) ? "-j"
                       : ($output_type =~ m/png/i)  ? "-p"
                       :                              "";

        # On Windows the bare program name is used instead of the full path
        my $extractor = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                            $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $extractor = "pptextract";
        }

        # An existing conversion directory is reported as converted
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $command = $timeout ? "ulimit -t $timeout;" : "";
        $command .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($command) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command ...
        my $perl = &util::get_perl_exec();
        my $command = "\"$perl\" -S ppttohtml.pl "
                    . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # ... and execute it
        $! = 0;
        return "html" if (system($command) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
430
431
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run ("html").  If it fails, or
# HTML was not wanted, text is extracted with the generic any_to_text
# ("text"); "fail" is returned when that fails too.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl = &util::get_perl_exec();
        my $command = "\"$perl\" -S xlstohtml.pl "
                    . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($command) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
465
466
467
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the name ends in .docx
#               (any letter case, matching native_doc_to_html's own test)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    # Three-argument open: the filename is taken literally, whatever
    # characters it contains.
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is released on every path
    close($chk);

    return $type;
}
508
509
510# Specific type-to-type conversions
511#
512# Each of the following functions attempts to convert a document from
513# a specific format to another. If they succeed they return 1 and leave
514# the output document(s) in the appropriate place; if they fail they
515# return 0 and delete any working files.
516
517
# Attempt to convert a word document to html with the wv program
#
# The work is delegated to wvware.pl, which is handed the input filename,
# the output filestem, the fail-log filename and the timeout, in that
# order.  Returns wvware.pl's exit code, or 0 when it could not be started
# or was killed by a signal.
# NOTE(review): callers treat any true value as success, so wvware.pl is
# presumably expected to exit with 1 on success -- confirm against
# wvware.pl.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # The path to perl and every filename is quoted in case there are
    # spaces in it.  Quoting also keeps an empty $faillogfile in its own
    # (third) position instead of letting $timeout slide into its place.
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl"
        . " \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $status = system($launch_cmd);

    # -1: the command could not be started; low seven bits set: it died
    # from a signal.  Neither may be mistaken for a successful conversion
    # (dividing such a status by 256 would give a non-zero, "true" value).
    if ($status == -1 || ($status & 127)) {
        print STDERR "Error executing wvware.pl\n";
        return 0;
    }

    # normal termination: the exit code lives in the high byte
    return $status >> 8;
}
532
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows, docx input is handed to docx2html.vbs (run through CScript);
# all other input goes to word2html.  Returns 1 when "<filestem>.html"
# already exists or was created and its first line mentions "html";
# returns 0 otherwise, in which case the converter's error output is
# echoed to STDERR and/or appended to the fail log.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR; the //Nologo
            # flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for
            # the conversion. Doesn't need full path, since bin\windows is
            # on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # release the handle (it used to be left open)
                close($errfile);
            }
            else {
                print STDERR "Warning: unable to read $err_file: $!\n";
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $html, '<', $html_file)) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
633
# Attempt to convert an RTF document to html with rtftohtml
#
# The conversion counts as successful when "<filestem>.html" contains some
# word characters after its <body> line.  If rtftohtml also wrote a table
# of contents ("<filestem>_ToC.html"), it is merged into the html file
# (with its links made relative) right after the header.  Returns 1 on
# success; on failure the working files are removed, the converter's
# error output is appended to the fail log (if one is set) and 0 is
# returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";
    my $src_file  = "$output_filestem.src";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s $html_file) {
        # make sure we have some content other than header
        if (open(my $html_in, '<', $html_file)) {
            my $past_header=0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                # strip the tags and look for real text
                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html_in);
        }
        else {
            # an unreadable result counts as a failed conversion
            print STDERR "Warning: unable to read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Each result is tested directly.  Writing 'open FH, "file" ||
            # ...' would bind the '||' to the filename string, so a failed
            # open would go unnoticed.
            my $src_ok = open(my $src_fh, '<', $src_file);
            my $toc_ok = open(my $toc_fh, '<', $toc_file);
            my $out_ok = open(my $out_fh, '>', $html_file);

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the merge and put the plain html back
                close($src_fh) if ($src_ok);
                close($toc_fh) if ($toc_ok);
                close($out_fh) if ($out_ok);
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character
            while (defined(my $line = <$src_fh>)) {
                last unless ($line =~ m/\w/);
                print {$out_fh} $line;
            }

            # print out table of contents, making links relative
            <$toc_fh>; <$toc_fh>;       # ignore first 2 lines
            my $list_start = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $list_start if (defined $list_start);
            while (my $line = <$toc_fh>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $line;
            }
            close($toc_fh);

            # rest of html src
            while (my $line = <$src_fh>) {
                print {$out_fh} $line;
            }
            close($src_fh);
            close($out_fh);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
750
751
# Convert a pdf file to html by running pdftohtml.pl.
#
# The -pdf_* command-line switches are translated into pdftohtml.pl
# options.  Returns 1 when the command succeeded and left a non-empty
# "<filestem>.html"; otherwise the converter's error output is echoed to
# STDERR, appended to the fail log (if one is set), the working files are
# removed and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";
    my $html_file = "$output_filestem.html";

    # assemble the command line
    my $command = $timeout ? "ulimit -t $timeout;" : "";
    my $perl = &util::get_perl_exec();
    $command .= "\"$perl\" -S pdftohtml.pl -zoom $pdf_zoom";
    foreach my $switch ([$pdf_complex,           " -c"],
                        [$pdf_ignore_images,     " -i"],
                        [$pdf_allow_images_only, " -a"],
                        [!$pdf_nohidden,         " -hidden"]) {
        my ($enabled, $text) = @$switch;
        $command .= $text if ($enabled);
    }
    $command .= " \"$input_filename\" \"$output_filestem\"";

    # capture the converter's output
    $command .= ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000)
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;
    my $retval = system($command);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, "$output_filestem.err");
            while (my $line = <$errlog>) {
                print $faillog $line;
            }
            close($errlog);
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
815
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl
#
# $output_type may carry a prefix such as "pagedimg_"; only the part after
# the last underscore is handed to pdfpstoimg.pl's -convert_to option.
# Returns 1 when pdfpstoimg.pl exits with status 0.  Returns 0 when
# ImageMagick's "identify" cannot be run or when the conversion fails; in
# the latter case the converter's error output is echoed to STDERR and
# appended to the fail log (if one is set).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # The same platform test as the rest of this file: unix, or an
    # NT-family Windows (relying on the $is_winnt_2000 probe made at
    # start-up instead of calling into Win32 again).
    my $unix_or_nt = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98)
    if ($unix_or_nt) {
        my $identify_output = `identify 2>&1`;
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            # ImageMagick is not installed, thus the convert utility is
            # not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # reduce e.g. "pagedimg_jpg" to the bare image format
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($unix_or_nt) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $!=0;
    my $retval=system($cmd);

    if ($retval == 0) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # The conversion failed; name the script that was actually run.
    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) {print STDERR ": $!";}
    print STDERR "\n";

    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $line = <$errlog>) {
            print STDERR $line;
        }
        close($errlog);
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
885
# Convert a PDF file to text with the pdftotext command
#
# Returns 1 when "<filestem>.text" was produced and contains at least one
# word character.  Otherwise the converter's error output is echoed to
# STDERR, appended to the fail log (if one is set), the working files are
# removed and 0 is returned.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            # the implicit defined() test keeps a final line consisting
            # of just "0" from ending the scan early
            while (my $line = <$extracted>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extracted);
        }
        else {
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file)
    {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close($errlog);
        }
        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    # Success: clear away both redirect files.  The shell creates the .out
    # file on non-Windows systems whether or not pdftotext prints anything.
    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
951
952# Convert a PostScript document to text
953# note - just using "ps2ascii" isn't good enough, as it
954# returns 0 for a postscript interpreter error. ps2ascii is just
955# a wrapper to "gs" anyway, so we use that cmd here.
956
# ps_to_text($input_filename, $output_filestem)
#
# Writes the text of the PostScript file $input_filename to
# "$output_filestem.text".
#
# Strategy:
#   1. Unless GSDLOS says we are on windows, run "gs" with ps2ascii.ps
#      (optionally CPU-limited through "ulimit -t $timeout").
#   2. If gs is skipped or fails, log the problem (to STDERR and, when
#      $faillogfile is set, to that file) and fall back to a crude
#      regexp-based extraction of the strings in the PostScript source.
#   3. Re-wrap the resulting text so lines break at the first whitespace
#      after 72 characters.
#
# Returns 1 when "$output_filestem.text" was produced, 0 when the fallback
# could not read the input, could not write the output, or the input does
# not start with the "%!" PostScript header.
# Dies if the files needed for the final re-wrapping step cannot be opened.
#
# Reads the file-level variables $timeout and $faillogfile.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the stderr target too, so a filestem containing spaces works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system for this...
        # $? is non-zero for a non-zero exit status, for death by signal
        # and for a failure to start the program at all (look at $! then)
        my $failed = ($? != 0);

        if ($failed) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                close $psout;
                # an empty output file yields undef here
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            # append whatever gs wrote to stderr
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (<$errlog>) {print $faillog $_;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        # see man perlport, under "System Resources"
        my $text = join('', <$in>);
        close $in;

        # Make sure this is a ps file...
        # (checked before the output file is created, so a rejected input
        # does not leave an empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or" (not "||") so that the die applies to open() itself rather than
    # to the always-true filename string
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # only trust $1 when the substitution really matched; otherwise
            # emit the remainder unchanged so the loop always terminates
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1121
1122
1123# Convert any file to HTML with a crude perl implementation of the
1124# UNIX strings command.
1125
# any_to_html($input_filename, $output_filestem)
#
# Produces "$output_filestem.html" from an arbitrary file: any_to_text()
# first extracts the text into "$output_filestem.text", which is then
# wrapped in a minimal HTML page. Blank lines become "<p>", every other
# line is prefixed with "<br> ". The intermediate .text file is removed.
#
# Returns 1 on success, 0 if any_to_text() fails or if either the
# intermediate text file or the HTML file cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        # don't leave the intermediate file lying around
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape HTML metacharacters; "&" must go first so the entities
        # produced for "<" and ">" are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1159
1160# Convert any file to TEXT with a crude perl implementation of the
1161# UNIX strings command.
1162# Note - this assumes ascii charsets :( (jrm21)
1163
# any_to_text($input_filename, $output_filestem)
#
# Writes the printable-ASCII strings found in $input_filename to
# "$output_filestem.text". The input is read in binary mode, one
# newline-terminated chunk at a time; within each chunk every run of
# characters outside \040-\176 becomes a line break and any resulting
# line shorter than 10 characters is dropped.
#
# Returns 1 if at least one chunk produced output. Returns 0 (and, once
# the output file has been created, removes it) when the file-level
# $use_strings flag is off, a file cannot be opened, or nothing
# printable was found.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    print STDERR "\n**** In any to text****\n\n";
    # three-argument open: the filename is taken literally, whatever
    # whitespace or mode characters it contains
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;

}
Note: See TracBrowser for help on using the repository browser.