source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24600

Last change on this file since 24600 was 24600, checked in by ak19, 13 years ago

Added gs-magick.pl script which will set the environment for ImageMagick (including LD_LIBRARY_PATH) before launching the requested ImageMagick command and arguments. By setting the ImageMagick environment from this script we ensure that the modified env variables don't create conflicts with libraries needed for normal linux execution. All the Greenstone files in the *binary* that made direct calls to imagemagick now go through this script. The affected files are perl files in bin/script and perllib and Gatherer.java of GLI. (wvware has files that test for imagemagick during compilation stage, which is independent of our changes which are only for users running imagemagick from a GS binary.) The final problems were related to how different perl files made use of the return values and the output of running their imagemagick command: they would query $? and/or $! and/or run the command with backtick operators to get the output printed to STDOUT. By inserting an intermediate gs-magick.pl file, we needed to ensure that the exit code stored in $? would at least be passed on correctly, as is necessary when testing the exit code against non-zero values or greater/less than zero (instead of comparing them with equals/not equal to 0). To get the correct exit code as emitted by imagemagick, calling code needs to shift bits in $? and convert it to a signed value.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 35.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Put $GSDLHOME/perllib at the front of the module search path before the
# "use" lines below are compiled (parsargv and util are presumably found
# there).  The script cannot run without GSDLHOME, so bail out early.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?  The eval yields undef when
# the Win32 module cannot be loaded, in which case the flag is forced to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() fills these in through parsargv::parse().
my ($use_strings, $windows_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
68
# Print a summary of the command-line options on STDERR and terminate the
# script with exit status 1.  This routine never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by the option parser in main() but used to be
        # missing from this summary
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
90
# Settings shared by the conversion routines; main() overrides them from
# the command line.
my $faillogfile = "";    # -errlog: converter error output is appended to this file
my $timeout     = 0;     # -timeout: when non-zero, commands are prefixed with "ulimit -t"
my $verbosity   = 0;     # -verbose
94
# Script entry point.
#
# Parses the command line (options plus exactly one input filename), works
# out the input document type (from -type, or else from the filename
# extension), changes into the directory holding the input file and hands
# the file to the matching convertXXX routine.  The string returned by that
# routine is printed on STDOUT followed by a newline.
#
# Calls print_usage() (which exits) on a malformed command line, and exits
# with status 1 if the input file is unreadable or its type is unknown.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re  = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The raw arguments still carry their leading dash(es), so those must be
    # allowed for here; otherwise the enhanced pattern is never selected.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with a '/', unlike the other
    # specs -- left untouched, confirm against parsargv.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # "auto" is advertised in the usage text, but the conversion routines
    # only recognise an empty/false output type as "try HTML, then TEXT".
    # Normalise it here so that -output auto does not simply fail.
    if (!defined $output_type || $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back on the filename extension (minus the dot)
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
200
# Run the conversion with the script's command-line arguments.
main(@ARGV);
202
203
204
205# Document-type conversion functions
206#
207# The following functions attempt to convert documents from their
208# input type to the specified output type. If no output type was
209# given, then they first attempt HTML, and then TEXT.
210#
211# Each returns the output type ("html" or "text") or "fail" if no
212# conversion is possible.
213
# Convert a Microsoft Word document.
#
# The file is sniffed with find_docfile_type() first, because many .doc
# files are not in fact Word documents.  Word 6/7/8 and docx files go to
# convertWord678(), RTF files to convertRTF(), and everything else to
# convertAnything().  Returns whatever the chosen routine returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    foreach my $word_type ("word6", "word7", "word8", "docx") {
        if ($realtype eq $word_type) {
            return &convertWord678($input_filename, $output_filestem, $output_type);
        }
    }

    # not something we recognise: fall back on the generic conversion
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
231
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML output is acceptable (no output type given, or one containing
# "html"), try native_doc_to_html() if -windows_scripting is set and
# doc_to_html() otherwise; return "html" if that succeeds.  In every other
# case the result of convertAnything() is returned.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
251
252
# Convert a Rich Text Format (RTF) file.
#
# Only HTML conversion is attempted (native_doc_to_html() when
# -windows_scripting is set, rtf_to_html() otherwise).  Returns "html" on
# success and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
279
280
# Convert an unidentified file.
#
# Tries any_to_html() when HTML output is acceptable and then
# any_to_text() when text output is acceptable; an empty/false output type
# allows both.  Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $unrestricted = !$output_type;

    # Attempt simple conversion to HTML
    if ($unrestricted || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if ($unrestricted || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
305
306
307
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images
# (pdfps_to_img), HTML (pdf_to_html) and plain text (pdf_to_text).
# Returns "item" for images, "html", "text", or "fail" if nothing worked.

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # if the type contains a hyphen, keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    my $unrestricted = !$output_type;

    # Attempt conversion to HTML
    if ($unrestricted || ($output_type =~ m/html/i)) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if ($unrestricted || ($output_type =~ m/text/i)) {
        if (&pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
342
343
# Convert an Adobe PostScript document.
#
# Tries page images (pdfps_to_img) when an image type was requested and
# plain text (ps_to_text) when text is acceptable.  Returns "item", "text"
# or "fail".

sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # if the type contains a hyphen, keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
368
369
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor
# text, the "pptextract" program is run to produce slide images ("item").
# Otherwise, when HTML is acceptable, ppttohtml.pl is run ("html").  If the
# chosen converter fails (or neither applies), any_to_text() is tried
# ("text").  Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the pptextract switch matching the requested image format
        my $ppt_convert_type = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $switch) = @$choice;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $switch;
                last;
            }
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # on windows the bare program name is used instead of the full path
            $vbScript = "pptextract";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        if (system($cmd) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
433
434
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned if
# the command exits with status 0.  Otherwise (or if that fails)
# any_to_text() is tried.  Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # fall back on plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
468
469
470
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"     - -windows_scripting is set and the filename ends in .docx
#                (any letter case); the file itself is not inspected
#   "rtf"      - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#              - some line contains "Word.Document.<6|7|8>"
#   "unknown"  - anything else, including a file that cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # case-insensitive, in line with the docx test in native_doc_to_html()
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # a single exit point so the handle is closed whatever the verdict
    close($chk);

    return $type;
}
511
512
513# Specific type-to-type conversions
514#
515# Each of the following functions attempts to convert a document from
516# a specific format to another. If they succeed they return 1 and leave
517# the output document(s) in the appropriate place; if they fail they
518# return 0 and delete any working files.
519
520
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl on the input file, passing it the fail log filename, the
# verbosity and the timeout.  The exit code of wvware.pl is returned and
# used by the callers as the success flag, so it is presumably non-zero on
# success -- confirm against wvware.pl.  Returns 0 if the command could
# not be launched or was killed by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() gives -1 when the command could not be started, and a value
    # whose low 7 bits are set when the child died from a signal.  Dividing
    # such values by 256 yields a non-zero (true) number, which used to be
    # mistaken for a successful conversion.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    # the child's exit code lives in the high byte
    return $raw_status >> 8;
}
535
# Attempt to convert a word document to html with the word2html scripting program
#
# On windows, .docx input is handled by docx2html.vbs (launched through
# CScript) and everything else by word2html; elsewhere the word2html found
# under $GSDLHOME/bin/$GSDLOS is used.
#
# Returns 1 if "$output_filestem.html" already exists, or if the converter
# exited with status 0 and produced a non-empty file whose first line
# mentions "html".  Returns 0 otherwise, after copying the converter's
# error output to the fail log (when one was given).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err" && open(my $errfile, "<", "$output_filestem.err")) {
            # the fail log is optional; $faillog stays undef when it is
            # not wanted or cannot be opened
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, ">>", $faillogfile) or undef $faillog;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($faillog);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n" if ($faillog);
            }

            close($faillog) if ($faillog);
            close($errfile);   # this handle used to be left open
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
# Attempt to convert an RTF document to html with rtftohtml
#
# The conversion counts as successful when "$output_filestem.html" is
# non-empty and has some word characters after its <body> line once tags
# are stripped.  If rtftohtml also wrote "${output_filestem}_ToC.html",
# that table of contents is spliced into the html file (with its links made
# relative) and the separate ToC file is removed.
#
# Returns 1 on success.  On failure the html and err files are removed,
# the error output is appended to the fail log (if any) and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if ((-s "$output_filestem.html")
        && open(my $html, "<", "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header=0;
        while (my $line = <$html>) {
            if (!$past_header) {
                if ($line =~ m/<body>/) {$past_header=1;}
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful=1;
                last;
            }
        }
        close($html);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # Note: the previous `open FH, "file" || ++$open_failed` form
            # bound the || to the filename, so a failed open went unnoticed.
            my $src_opened = open(my $htmlsrc, "<", "$output_filestem.src");
            my $toc_opened = open(my $toc, "<", "${output_filestem}_ToC.html");
            my $out_opened = open(my $htmlout, ">", "$output_filestem.html");

            if (!($src_opened && $toc_opened && $out_opened)) {
                # give up on the ToC and put the converted file back
                close($htmlsrc) if ($src_opened);
                close($toc) if ($toc_opened);
                close($htmlout) if ($out_opened);
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html (up to the first line
            # without any word characters, which is dropped).
            while (defined(my $line = <$htmlsrc>)) {
                last unless ($line =~ m/\w/);
                print {$htmlout} "$line";
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;     # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>;  # line 3 = "<ol>\n"
            print {$htmlout} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$htmlout} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$htmlout} $line;
            }
            close($htmlsrc);
            close($htmlout);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
753
754
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the zoom factor and the -c/-i/-a/-hidden switches
# selected by the pdf_* options.  Returns 1 (after removing the .err and
# .out files) when the command exits with status 0 and leaves a non-empty
# "$output_filestem.html".  Otherwise the converter's error output is
# echoed on STDERR and appended to the fail log (if any), the working
# files are removed and 0 is returned.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        # no separate STDERR redirection here: STDOUT goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    # keep a copy of the error output in the fail log before discarding it
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && defined(open($faillog, ">>$faillogfile"))) {
            open(my $errlog, $err_file);
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
818
# Convert a pdf (or postscript) file to various types of image with the
# convert command
#
# First checks, by running "identify" through gs-magick.pl, that
# ImageMagick is available (the check is skipped on Windows 95/98), then
# runs pdfpstoimg.pl with -convert_to set to whatever follows the last
# underscore of $output_type.
#
# Returns 1 when pdfpstoimg.pl exits with status 0.  Returns 0 when
# ImageMagick is missing or the conversion fails; in the latter case the
# converter's error output is echoed on STDERR and appended to the fail
# log (if any).

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        # only the exit status matters; the output is captured to keep it quiet
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if ($status == -1 || $status == 1) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # "pagedimg_jpg" -> "jpg"
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        # the message used to name a non-existent "pdftoimg.pl"
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter succeeded
    if ($retval !=0)
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open (ERRLOG, "$output_filestem.err") || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (<ERRLOG>) {
                print STDERR "$_";
            }
            close ERRLOG;
        }
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
            {
                open (ERRLOG, "$output_filestem.err");
                while (<ERRLOG>) {print FAILLOG $_;}
                close ERRLOG;
                close FAILLOG;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
900
# Convert a PDF file to text with the pdftotext command
#
# Returns 1 when "$output_filestem.text" ends up non-empty and contains at
# least one word character.  Otherwise the converter's error output is
# echoed on STDERR and appended to the fail log (if any), the working
# files are removed and 0 is returned.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open(my $extracted, $text_file) || warn "open: $!";
        binmode($extracted); # just in case...
        my $line = "";
        my $seen_text = 0;
        # stop reading as soon as one line with a word character turns up
        while (!$seen_text && ($line = <$extracted>)) {
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close($extracted);
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something: only the error file needs clearing away
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open(my $errlog, $err_file) || die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $errline = <$errlog>) {
            print STDERR "$errline";
        }
        close($errlog);
    }

    # does this converter create a .out file?
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);

    # keep a copy of the error output in the fail log before discarding it
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && defined(open($faillog, ">>$faillogfile"))) {
            open(my $errlog, $err_file);
            while (my $errline = <$errlog>) {
                print {$faillog} $errline;
            }
            close($errlog);
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
966
967# Convert a PostScript document to text
968# note - just using "ps2ascii" isn't good enough, as it
969# returns 0 for a postscript interpreter error. ps2ascii is just
970# a wrapper to "gs" anyway, so we use that cmd here.
971
sub ps_to_text {
    # Convert a PostScript document to plain text.
    #
    # First tries ghostscript ("gs" with ps2ascii.ps); this is skipped on
    # Windows.  If that fails for any reason, falls back to a crude
    # regexp-based extraction of the string literals in the PostScript
    # source.  Finally the resulting text is re-wrapped so that lines break
    # at the first whitespace after 72 characters.
    #
    # Parameters:
    #   $input_filename  - path of the PostScript file
    #   $output_filestem - path stem; the result is "$output_filestem.text"
    #
    # Returns 1 on success.  Returns 0 if the fallback cannot read the
    # input, cannot write the output, or the input does not start with
    # "%!".  Dies if the files used for the wrapping pass cannot be opened.
    # Failures of gs are appended to $faillogfile when it is set.
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the redirect target like every other path in the command
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        system($cmd);
        # Look at the whole wait status, not just the exit code in the high
        # byte, so that a process killed by a signal (e.g. the CPU limit
        # set with ulimit above) is not mistaken for success.
        # see man perlfunc - system for this...
        my $status = $?;

        if ($status != 0) {
            # if system returns -1 | 127 (couldn't start program), look
            # at $! for message
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is
            # technically possible for the actual text to start with
            # this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Read and validate the input before creating the output file, so
        # that no empty .text file or open handle is left behind on failure.
        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = ""; # this is for whole .ps file...
        $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # NB: use low-precedence "or" here.  With "||" the die binds to the
    # file name string and a failed open would go unnoticed.
    open(my $infile, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Peel off one wrapped line: at least $wrap_length characters,
            # extended to the end of the current word.  If the pattern
            # does not match, emit the remainder as is, so the loop always
            # terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1136
1137
1138# Convert any file to HTML with a crude perl implementation of the
1139# UNIX strings command.
1140
sub any_to_html {
    # Convert any file to HTML: extract printable strings with any_to_text,
    # then wrap the resulting text in a minimal HTML page.  Blank lines
    # become "<p>", every other line is prefixed with "<br> ".
    #
    # Parameters:
    #   $input_filename  - path of the file to convert
    #   $output_filestem - path stem; the result is "$output_filestem.html"
    #
    # Returns 1 on success, 0 if any_to_text fails or if the intermediate
    # text file / the HTML file cannot be opened.  The intermediate
    # "$output_filestem.text" file is removed before returning.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', "$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', "$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close $text_fh;
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # Escape "&" first, so that the entities produced for "<" and ">"
        # are not escaped a second time and literal ampersands in the text
        # are not read as the start of an entity.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1174
1175# Convert any file to TEXT with a crude perl implementation of the
1176# UNIX strings command.
1177# Note - this assumes ascii charsets :( (jrm21)
1178
sub any_to_text {
    # Convert any file to text with a crude imitation of the UNIX "strings"
    # command: non-printable bytes are turned into line breaks and runs of
    # fewer than 10 printable characters are discarded.
    #
    # Parameters:
    #   $input_filename  - path of the file to scan (read in binary mode)
    #   $output_filestem - path stem; the result is "$output_filestem.text"
    #
    # Returns 0 immediately when $use_strings is false.  Returns 0 if either
    # file cannot be opened, or if nothing printable was found (in which
    # case the output file is removed).  Returns 1 otherwise.
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";
    # Three-argument open: the file name is taken literally, so leading or
    # trailing whitespace and mode characters in the name cannot be
    # misinterpreted.
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in; # don't leave the input handle open on failure
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.