source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 30681

Last change on this file since 30681 was 30681, checked in by ak19, 8 years ago

3 new strings introduced by Kathy contained the :, which is used as a separator in the properties file. Although Kathy tried to escape it with a backslash, it broke GTI because GTI doesn't recognise the backslash as a separator and all kinds of weird things happened from then on, so that the Gujarati translator kept having to translate the current date rather than a real GS3 interface string. Modified the gti.pl code (to be committed) and the new strings that Kathy introduced, so that hopefully, GTI can now handle it. Property names and values will be split at the right-most separator character now (= or :) and any on the left should not be escaped.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# The modules loaded below are looked up in $GSDLHOME/perllib, so that
# directory must be on the search path before any "use" line is resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag falls back to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; they are filled in by parsargv::parse() in main().
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
69
# Print a summary of the command-line options on STDERR and exit with
# status 1. This sub never returns to its caller.
sub print_usage
{
    print STDERR join("",
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but used to be missing from this list
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from the original message
        "\t\t-pdf_complex is set)\n");
    exit(1);
}
91
# Settings shared by the conversion routines below. main() overwrites them
# from -errlog, -timeout and -verbose respectively.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Entry point. Parses the command-line options, checks that the single
# remaining argument is a readable file, works out the input type (from
# -type or, failing that, from the filename extension), changes into the
# file's directory and hands over to the matching convertXXX() routine.
# Whatever that routine returns ("html", "text", "item" or "fail") is
# printed on STDOUT followed by a newline.
#
# Exits with status 1 (directly or via print_usage) on bad options, an
# unreadable input file, or an input type that cannot be handled.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the --type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The option is written "-windows_scripting" on the command line (see
    # print_usage), so the leading dash(es) have to be allowed for here:
    # the arguments have not been through parsargv yet at this point.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    # NOTE(review): unlike the other specs, the errlog one starts with a
    # '/'. Left untouched - confirm that parsargv accepts this form.
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # "auto" is an advertised output type, but the converters only look
    # for "html", "text" or an image format in the string. Treat it as
    # "no preference" so that they try HTML first and then TEXT.
    if (defined $output_type && $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    # File::Basename is a core module; loading it here means the
    # fully-qualified call below does not rely on another module having
    # pulled it in.
    require File::Basename;
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back to the extension, minus its leading dot
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        # neither -type nor a filename extension told us what this is
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        # anchored at both ends so that e.g. an ".xppt" extension is not
        # mistaken for PowerPoint
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
203
204
205
206# Document-type conversion functions
207#
208# The following functions attempt to convert documents from their
209# input type to the specified output type. If no output type was
210# given, then they first attempt HTML, and then TEXT.
211#
212# Each returns the output type ("html" or "text") or "fail" if no
213# conversion is possible.
214
# Convert a file that claims to be a Microsoft Word document.
#
# The file is sniffed first, because many .doc files are not in fact word
# documents: genuine Word 6/7/8 files and docx go to convertWord678(),
# RTF in disguise goes to convertRTF(), and everything else is handed to
# convertAnything(). Returns whatever the chosen routine returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = &find_docfile_type($input_filename);

    if ($realtype =~ m/\A(?:word[678]|docx)\z/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one containing
# "html"), the dedicated converter is tried first: native_doc_to_html()
# under -windows_scripting, doc_to_html() otherwise. If that is skipped
# or reports failure, the result of convertAnything() is returned.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file.
#
# Only HTML output is attempted: native_doc_to_html() under
# -windows_scripting, rtf_to_html() otherwise. Returns "html" when that
# succeeds and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # There is deliberately no fallback to convertAnything() here: rtf is
    # so ugly that it's not worth running strings over. One day I'll write
    # some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
280
281
# Convert an unidentified file.
#
# Tries any_to_html() when HTML is acceptable and then any_to_text() when
# text is acceptable; an empty output type accepts both. Returns "html",
# "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if (($no_preference || ($output_type =~ m/html/i))
        && &any_to_html($input_filename, $output_filestem)) {
        return "html";
    }

    # Convert to text
    if (($no_preference || ($output_type =~ m/text/i))
        && &any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page
# images (pdfps_to_img), HTML (pdf_to_html) and plain text (pdf_to_text).
# Returns "item" for images, "html", "text", or "fail" when nothing
# worked.

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # if the type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if (&pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
343
344
# Convert an Adobe PostScript document.
#
# Tries page images (pdfps_to_img) when an image type was asked for, then
# plain text (ps_to_text) when text is acceptable. Returns "item",
# "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # if the type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
369
370
# Convert a Microsoft PowerPoint presentation.
#
# Under -windows_scripting, and when neither HTML nor text was asked for,
# the pptextract script is run and "item" is returned on success (or
# straight away if the output directory already exists). Otherwise, when
# HTML is acceptable, ppttohtml.pl is run and "html" is returned on
# success. If the chosen converter fails, or none applies, any_to_text()
# is the last resort: "text" on success, "fail" otherwise.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the pptextract switch for the requested image format; the
        # first matching pattern wins and no match leaves it empty
        my $ppt_convert_type = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $switch) = @$choice;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $switch;
                last;
            }
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript (rather than the old
            # pptextract.exe VB executable)
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    return (&any_to_text($input_filename, $output_filestem)) ? "text" : "fail";
}
436
437
# Convert a Microsoft Excel spreadsheet.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned on
# success. In every other case any_to_text() is the last resort: "text"
# on success, "fail" otherwise.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return (&any_to_text($input_filename, $output_filestem)) ? "text" : "fail";
}
471
472
473
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the filename ends in .docx
#               (any letter case, matching the test native_doc_to_html
#               makes); the file itself is not opened in this case
#   "rtf"     - the first line starts with "{\rtf"
#   "wordN"   - some line contains "Word.Document.N", with N being 6, 7 or 8
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $realtype;
}
514
515
516# Specific type-to-type conversions
517#
518# Each of the following functions attempts to convert a document from
519# a specific format to another. If they succeed they return 1 and leave
520# the output document(s) in the appropriate place; if they fail they
521# return 0 and delete any working files.
522
523
# Attempt to convert a word document to html by launching wvware.pl
#
# The launcher is given the input file, the output stem, the fail log,
# the verbosity and the timeout. The value returned is system()'s result
# divided by 256, and the caller (convertWord678) treats any non-zero
# value as success.
# NOTE(review): presumably wvware.pl exits with a non-zero status when the
# conversion worked - confirm against that script.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $perl_exec = &util::get_perl_exec();
    my $launch_cmd = join(" ",
                          "\"$perl_exec\"", "-S", "wvware.pl",
                          "\"$input_filename\"", "\"$output_filestem\"",
                          "\"$faillogfile\"", $verbosity, $timeout);

    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    return system($launch_cmd) / 256;
}
538
# Attempt to convert a word document to html with the word2html scripting
# program (or, for docx input on windows, the docx2html.vbs script).
#
# Returns 1 when "$output_filestem.html" already exists, or when the
# converter ran cleanly and the first line of the html file it produced
# mentions "html". Returns 0 otherwise; in that case the converter's
# error output is copied to the fail log (when one is configured).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR; the //Nologo
            # flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        my $errfile;
        if (-s "$output_filestem.err" && open($errfile, "<", "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log=0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {$write_to_fail_log=1;}

            while (my $line=<$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);

            } # while errfile
            close($faillog) if ($write_to_fail_log);
            # the error file handle used to be left open here
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
639
# Attempt to convert an RTF document to html with rtftohtml
#
# The conversion counts as successful when "$output_filestem.html" is
# non-empty and has some word characters after its <body> line once the
# tags are stripped. If the converter also wrote a
# "${output_filestem}_ToC.html" file, that table of contents is spliced
# into the html straight after the header and the ToC file is removed.
#
# Returns 1 on success. On failure it returns 0 after copying the
# converter's error output to the fail log (when one is configured) and
# removing the .err and .html files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; a file that
        # cannot be opened simply counts as "no content"
        if (open(my $html, "<", "$output_filestem.html")) {
            my $past_header=0;
            while (my $line=<$html>) {

                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # The old "open FH, $name || ++$open_failed" never noticed a
            # failure, because || bound to the filename string. Each open
            # is now checked, and the output file is only created once
            # both inputs are known to be readable.
            my ($htmlsrc, $toc, $html);
            my $src_ok = open($htmlsrc, "<", "$output_filestem.src");
            my $toc_ok = open($toc, "<", "${output_filestem}_ToC.html");
            my $out_ok = ($src_ok && $toc_ok)
                ? open($html, ">", "$output_filestem.html") : 0;

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the table of contents and put the plain
                # html back where it was
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>;     # ignore first 2 lines
            $ignored = <$toc>;
            my $list_start = <$toc>;  # line 3 = "<ol>\n"
            print {$html} $list_start if (defined $list_start);
            while (my $line=<$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close($htmlsrc);
            close($html);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
756
757
# Convert a pdf file to html by running pdftohtml.pl
#
# Returns 1 when the command exits with status 0 and leaves a non-empty
# "$output_filestem.html" behind; the .err and .out scratch files are
# then removed. Otherwise the converter's error output is echoed on
# STDERR, appended to the fail log (when one is configured), the .out,
# .html and .err files are removed and 0 is returned.
# $dirname is accepted but not used here.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # formulate the command
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Where STDERR can be redirected it goes to the .err file and STDOUT
    # to the .out file; otherwise STDOUT itself is sent to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    # execute the command
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # from here on the conversion has failed
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<ERRLOG>) {
            print STDERR "$_";
        }
        close ERRLOG;
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            while (<ERRLOG>) {print FAILLOG $_;}
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
821
# Convert a pdf (or PostScript) file to various types of image by running
# pdfpstoimg.pl
#
# Unless we are on Windows 95/98, gs-magick.pl is first asked to run
# "identify" to find out whether ImageMagick is available; if it is not,
# 0 is returned straight away. $output_type may carry a prefix ending in
# '_' (as in "pagedimg_jpg"); only the part after the last '_' is passed
# on as -convert_to.
#
# Returns 1 when pdfpstoimg.pl exits with status 0. Otherwise the
# converter's error output is echoed on STDERR, appended to the fail log
# (when one is configured), the .out and .err files are removed and 0 is
# returned. $dirname is accepted but not used here.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except
    # for Windows 95/98). $is_winnt_2000 holds the answer Win32::IsWinNT()
    # gave at start-up, so there is no need to call into Win32 again here
    # (which would die if that module could not be loaded).
    if (!($ENV{'GSDLOS'} eq "windows" && !$is_winnt_2000)) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval !=0)
    {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open (my $errlog, "<", "$output_filestem.err") || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        if (-e "$output_filestem.err") {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
905
# Convert a PDF file to text with the pdftotext command
#
# The result is written to "$output_filestem.text". The conversion only
# counts as successful when that file ends up non-empty and contains at
# least one word character.
#
# Returns 1 on success, after removing the .err and .out scratch files.
# On failure it returns 0 after echoing the converter's error output on
# STDERR, appending it to the fail log (when one is configured) and
# removing the .out, .text and .err files.
# $dirname is accepted but not used here.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text=0;
        if (open(my $extr_text, "<", "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            # defined() is spelled out so that a final line consisting of
            # "0" is not mistaken for end of file
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text=1;
                    last;
                }
            }
            close($extr_text);
        }
        else {
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open (my $errlog, "<", "$output_filestem.err") || die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        # does this converter create a .out file?
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    # The shell redirect above always creates the .out file on non-windows
    # systems; it used to be left behind when the conversion succeeded.
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
971
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Strategy: run gs (skipped on windows); if that fails for any reason, log
# the failure and fall back to a crude regexp-based extraction of the
# strings found in the PostScript source.  Whatever text results is then
# re-wrapped so lines break at the first whitespace after 72 characters.
#
# Returns 1 on success, 0 if the input could not be read, the output could
# not be written, or the input does not start with the "%!" header.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";
    if ($gsdlos =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path in the command, so that filestems
        # containing spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # system() returns the full wait status (or -1 if the command could
        # not be started).  Test the whole value rather than just the exit
        # byte, so a gs that was killed by a signal (e.g. the ulimit above)
        # is not mistaken for success.  See man perlfunc - system.
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $ps_fh, '<', $text_file)) {
                my $first_line = <$ps_fh>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($ps_fh);
            } else {
                $error = "could not read output file: $!";
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            print $fail_fh "gs - $error\n";
            if (-e $err_file && open(my $err_fh, '<', $err_file)) {
                while (my $err_line = <$err_fh>) {
                    print $fail_fh $err_line;
                }
                close($err_fh);
            }
            close($fail_fh);
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in_fh;
        if (!open($in_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in_fh>); # see man perlport, under "System Resources"
        close($in_fh);

        # Make sure this is a ps file...
        # (checked before the output file is created, so a rejected input
        # leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                print $fail_fh "Bad postscript header: not '%!'\n";
                close($fail_fh);
            }
            return 0;
        }

        my $out_fh;
        if (!open($out_fh, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out_fh "$text";
        close($out_fh);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);

    # "or" (not "||") so that the die applies to the open itself rather
    # than to the filename string
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                # short remainder (or, defensively, a line the pattern
                # cannot split): emit as-is and move on
                print $wrap_out "$line";
                $line = "";
            }
        }
    }
    close($wrap_in);
    close($wrap_out);
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1141
1142
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text() into
# "$output_filestem.text", then wrapped in a minimal HTML page: blank lines
# become <p>, every other line is prefixed with <br>.  The intermediate
# .text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text() fails or either file cannot be
# opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "Couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "Couldn't write $html_file: $!\n";
        close($text_fh);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape '&' first so the entities introduced below are not
        # themselves re-escaped, and so literal text such as "&lt;" in the
        # input is displayed verbatim
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close($html_fh);
    close($text_fh);

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1179
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan
#   $output_filestem - output path without extension; the strings found are
#                      written to "$output_filestem.text"
#
# Returns 0 immediately unless $use_strings is set.  Otherwise returns 1 if
# at least one chunk of text was written, and 0 if either file could not be
# opened or nothing printable survived (in which case the .text file is
# removed again).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless $use_strings;

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    open(my $source_fh, "<$input_filename") or return 0;
    binmode($source_fh);
    open(my $text_fh, ">$text_file") or return 0;

    my $chunks_written = 0;
    while (defined(my $chunk = <$source_fh>)) {

        # every run of non-printable characters collapses to one newline
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # blank out strings shorter than 10 characters, repeating until
        # no short line is left
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # squeeze repeated newlines and drop a leading one
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # skip chunks with nothing but newlines and spaces left
        next unless ($chunk =~ m/[^\n ]/);

        print $text_fh $chunk;
        ++$chunks_written;
    }

    close $text_fh;
    close $source_fh;

    # try to protect against binary only formats
    if (!$chunks_written) {
        &FileUtils::removeFiles($text_file);
        return 0;
    }

    return 1;
}
Note: See TracBrowser for help on using the repository browser.