source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion are moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. Yet to introduce the PDFv2Plugin. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 43.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Put Greenstone's own perllib directory at the front of the module search
# path before any of the 'use' lines below are compiled. Refuses to run
# when the GSDLHOME environment variable is missing.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    @INC = ("$gsdl_home/perllib", @INC);
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# If Win32.pm cannot be loaded the eval yields undef, which is mapped to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Option state shared by the conversion routines below. All but $enc are
# bound to command-line options by main(); $enc is not assigned anywhere in
# the visible part of this file.
my ($use_strings,
    $pdf_tool,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting,
    $enc);
71
# Print the command-line synopsis to STDERR and terminate with exit status 1.
# Never returns. Documents every option that main() hands to parsargv,
# including -verbose.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
94
# File-level settings read by the conversion routines:
#   $faillogfile - file that error output is appended to ("" = none)
#   $timeout     - cpu-seconds limit passed to 'ulimit -t' (0 = no limit)
#   $verbosity   - verbosity level forwarded to helper scripts
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
98
# Entry point: parse the command line, work out the input type, change into
# the input file's directory and dispatch to the matching convertXXX routine.
# The routine's result string ("html", "text", "item", "fail", ...) is
# printed on STDOUT followed by a newline.
#
# Arguments: the command-line argument list (normally @ARGV).
# Exits with status 1 (via print_usage or directly) on bad usage, an
# unreadable input file or an unsupported/undeterminable input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # Options arrive with their leading dash(es), as shown in the usage
    # text, so the dash has to be allowed for when looking for the flag.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec below starts with a '/', unlike the
    # other specs - left untouched; confirm against parsargv.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_tool/(pdftohtml|pdfbox|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools
                         'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool)
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one argument (the input file) must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back to the filename extension without its dot
    if (!defined $input_type || $input_type eq "") {
        $input_type = (defined $suffix && $suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) or die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    my $result;
    if ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) or die "Unable to return to directory $stored_dir";
}
205
# Run the converter on the script's command-line arguments.
main(@ARGV);
207
208
209
210# Document-type conversion functions
211#
212# The following functions attempt to convert documents from their
213# input type to the specified output type. If no output type was
214# given, then they first attempt HTML, and then TEXT.
215#
216# Each returns the output type ("html" or "text") or "fail" if no
217# conversion is possible.
218
219# Convert a Microsoft word document
220
# Convert a file that claims to be a Word document.
# The file is sniffed first, since many .doc files are really something
# else, and the work is handed to the converter for the detected type.
# Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    return convertRTF($input_filename, $output_filestem, $output_type)
        if ($detected eq "rtf");

    my %is_word = map { $_ => 1 } qw(word6 word7 word8 docx);
    return convertWord678($input_filename, $output_filestem, $output_type)
        if ($is_word{$detected});

    return convertAnything($input_filename, $output_filestem, $output_type);
}
236
237# Convert a Microsoft word 6/7/8 document
238
# Convert a Word 6/7/8 (or docx) document.
# HTML is attempted when no output type was requested or the requested type
# mentions html, using the windows-scripting converter if that option is on
# and the wv-based one otherwise. Anything else, or a failed HTML attempt,
# is passed on to convertAnything.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
256
257
258# Convert a Rich Text Format (RTF) file
259
# Convert a Rich Text Format file.
# Only HTML output is attempted (via windows scripting when enabled, else
# rtftohtml). Returns "html" on success and "fail" otherwise; there is
# deliberately no fallback to convertAnything because running strings over
# raw RTF gives nothing useful.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    return "fail" unless ($html_wanted);

    my $converted;
    if ($windows_scripting) {
        $converted = native_doc_to_html($input_filename, $output_filestem);
    }
    else {
        $converted = rtf_to_html($input_filename, $output_filestem);
    }

    return ($converted ? "html" : "fail");
}
284
285
286# Convert an unidentified file
287
# Convert a file of unknown type with the generic extractors.
# HTML is tried first and then plain text, each only if the requested
# output type allows it (an empty output type allows both).
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
310
311
312
313# Convert an Adobe PDF document
314
# Convert an Adobe PDF document with the tool selected by -pdf_tool.
#
#  pdftohtml : image output (jpg/gif/png) returns "item"; an output type
#              starting with "html" returns "html"; an empty or text
#              output type returns "text".
#  xpdftools : an empty output type defaults to html; html/paged_html
#              returns the output type itself; text returns "text".
#
# Any other tool value, or a conversion that does not succeed, gives "fail".
# (pdfbox does not currently go through gsConvert.pl as PDFBoxConverter
# inherits from AutoLoadConverters.)
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the output type contains a hyphen, keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    if ($pdf_tool eq "xpdftools") {
        # default to html output
        $output_type = "html" unless ($output_type);

        # (Image output is not implemented for this tool.)

        # (paged) HTML using the newer pdftohtml of Xpdftools
        if ($output_type =~ m/^(paged_html|html)$/i) {
            return $output_type
                if (xpdf_to_html($dirname, $input_filename, $output_filestem));
        }

        # TEXT
        if ($output_type =~ m/text/i) {
            return "text"
                if (xpdf_to_text($dirname, $input_filename, $output_filestem));
        }
    }
    elsif ($pdf_tool eq "pdftohtml") { # old pdftohtml tool
        # Attempt conversion to Image
        if ($output_type =~ m/jp?g|gif|png/i) {
            return "item"
                if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
        }

        # Attempt conversion to HTML
        # Uses the old pdftohtml that doesn't work for newer PDF versions
        if ($output_type =~ m/^html/i) {
            return "html"
                if (pdf_to_html($dirname, $input_filename, $output_filestem));
        }

        # Attempt conversion to TEXT (not for Windows, but
        # PDFPlugin/PDFv1Plugin takes care of that)
        if (!$output_type || ($output_type =~ m/text/i)) {
            return "text"
                if (pdf_to_text($dirname, $input_filename, $output_filestem));
        }
    }

    return "fail";
}
390
391
392# Convert an Adobe PostScript document
393
# Convert a PostScript document.
# Image output types (jpg/gif/png) are tried through pdfps_to_img and give
# "item"; an empty or text output type is tried through ps_to_text and
# gives "text". Returns "fail" when nothing succeeded.
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # If the output type contains a hyphen, keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    my $image_wanted = ($output_type =~ m/jp?g|gif|png/i);
    if ($image_wanted
        && pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    my $text_wanted = (!$output_type || ($output_type =~ m/text/i));
    if ($text_wanted && ps_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
416
417
# Convert a PowerPoint document.
#  - With windows scripting on and an output type that is neither html nor
#    text, the pptextract script is run to produce per-slide output in the
#    $output_filestem directory ("item"). An already existing directory is
#    reported and also counts as "item".
#  - Otherwise, for an empty or html output type, ppttohtml.pl is run
#    ("html").
#  - If neither produced a result, plain text extraction is attempted
#    ("text"), and failing that "fail" is returned.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto pptextract's option letter;
        # the first pattern that matches wins.
        my $ppt_convert_type = "";
        foreach my $mapping (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $flag) = @$mapping;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $flag;
                last;
            }
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = ($timeout ? "ulimit -t $timeout;" : "");
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return (any_to_text($input_filename, $output_filestem) ? "text" : "fail");
}
483
484
# Convert an Excel spreadsheet.
# For an empty or html output type xlstohtml.pl is run ("html" on
# success). Otherwise, or if that fails, plain text extraction is
# attempted ("text"); "fail" is returned when that fails as well.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my @pieces = ("\"$full_perl_path\" -S xlstohtml.pl ",
                      " \"$input_filename\" \"$output_filestem.html\"");
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            push(@pieces, " 2>\"$output_filestem.err\"");
        }
        my $cmd = join("", @pieces);

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return (any_to_text($input_filename, $output_filestem) ? "text" : "fail");
}
518
519
520
521# Find the real type of a .doc file
522#
523# We seem to have a lot of files with a .doc extension that are .rtf
524# files or Word 5 files. This function attempts to tell the difference.
# Work out what a .doc file really contains.
#
# Returns one of:
#   "docx"     - windows scripting is enabled and the name ends in .docx
#                (case-insensitive, matching native_doc_to_html)
#   "rtf"      - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#              - a line contains the marker "Word.Document.<n>"
#   "unknown"  - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $doctype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $doctype;
}
561
562
563# Specific type-to-type conversions
564#
565# Each of the following functions attempts to convert a document from
566# a specific format to another. If they succeed they return 1 and leave
567# the output document(s) in the appropriate place; if they fail they
568# return 0 and delete any working files.
569
570
571# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wv program, by
# launching wvware.pl through the perl interpreter.
#
# Returns the exit code of wvware.pl, which callers treat as a boolean
# (non-zero = converted). Returns 0 when the command could not be launched
# or was terminated by a signal, so that those cases are not mistaken for
# success.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # -1: the command could not be started; low 7 bits set: killed by a signal
    return 0 if ($raw_status == -1 || ($raw_status & 127));

    # the high byte carries the script's own exit code
    return $raw_status >> 8;
}
585
586# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with a scripting program:
# docx2html.vbs (run through CScript) for docx input on Windows, and
# word2html otherwise.
#
# Returns 1 if "$output_filestem.html" already exists or was produced and
# its first line mentions "html"; returns 0 otherwise. On failure the
# converter's error output is echoed and/or appended to the fail log
# ($faillogfile) when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my ($errfile, $faillog);
            my $err_opened = open($errfile, '<', "$output_filestem.err");

            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if ($err_opened) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
686
687# Attempt to convert an RTF document to html with rtftohtml
# Attempt to convert an RTF document to html with rtftohtml.
#
# The conversion counts as successful when "$output_filestem.html" holds
# at least one word character of text after its <body> line. If rtftohtml
# also wrote "${output_filestem}_ToC.html", that table of contents is
# spliced into the html after the header, with its links made relative.
#
# Returns 1 on success. On failure the converter's error output is
# appended to the fail log (when configured), the working files are
# removed and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header;
        # an unreadable file simply counts as "no content"
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    if ($line =~ m/<body>/) {$past_header = 1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # Open the three files in turn, stopping at the first failure
            # so that the output file is only created once both inputs are
            # known to be readable.
            my ($htmlsrc, $toc, $html);
            my $src_ok  = open($htmlsrc, '<', "$output_filestem.src");
            my $toc_ok  = $src_ok && open($toc, '<', "${output_filestem}_ToC.html");
            my $html_ok = $toc_ok && open($html, '>', "$output_filestem.html");

            if (!$html_ok) {
                # could not merge: put the converted html back as it was
                close($toc) if ($toc_ok);
                close($htmlsrc) if ($src_ok);
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html (up to the first line
            # without a word character, which is consumed but not copied).
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print $html "$header_line";
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print $html $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print $html $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print $html $line;
            }
            close($htmlsrc);
            close($html);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            #print $faillog "Error - rtf-converter - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
803
804
805# Convert a pdf file to html with the old pdftohtml command
806# which only works for older PDF versions
# Convert a pdf file to html with the old pdftohtml command (through the
# pdftohtml.pl wrapper), which only works for older PDF versions.
#
# Command-line flags are derived from the -pdf_zoom, -pdf_complex,
# -pdf_ignore_images, -pdf_allow_images_only and -pdf_nohidden options.
# Returns 1 when the command exits with 0 and "$output_filestem.html" is
# non-empty. Otherwise the converter's error output is echoed to STDERR,
# appended to the fail log (when configured), the working files are
# removed and 0 is returned. Dies if a non-empty error log cannot be opened.
# ($dirname is accepted but not used here.)
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html")
    {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print $faillog $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
868
869
870# Convert a pdf file to html with the newer Xpdftools' pdftohtml
871# This generates "paged HTML" where extracted, selectable text is positioned
872# over screenshots of each page.
873# Since xpdf's pdftohtml fails if the output dir already exists and for easier
874# naming, the output files are created in a "pages" subdirectory of the tmp
875# location parent of $output_filestem instead
# Convert a pdf file to html with the newer Xpdftools' pdftohtml.
#
# The tool is asked to write into a "pages" directory created alongside
# $output_filestem (Xpdf's pdftohtml only writes into a directory that does
# not exist yet). Success means the command exited with 0 and
# pages/index.html is non-empty.
#
# Returns 1 on success. On failure the tool's error output is echoed to
# STDERR, appended to the fail log (when configured), the working files
# are removed and 0 is returned. Dies if a non-empty error log cannot be
# opened. ($dirname is accepted but not used here.)
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";

    # build up the path to the pdf-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # We'll create the file by name $output_filestem during post-conversion processing.
    # For now, Xpdftools will create <parent dir of output_filestem>/pages
    # and put its products in there; the parent dir is the already
    # generated tmp area for conversion.
    my (undef, $tmp_dirname)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    $cmd .= "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
    {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        # NOTE(review): removeFiles is being handed a directory here -
        # confirm FileUtils::removeFiles copes with non-empty directories.
        &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {print $faillog $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
954
# Returns the directory holding the bundled xpdf-tools executables, i.e.
#   <GSDLHOME>/bin/<GSDLOS>/xpdf-tools/bin
# Both GSDLHOME and GSDLOS are read from the environment. The directory is
# not checked for existence.
sub _get_xpdftools_bindir {
    # The result is forced into scalar context so that callers using this
    # sub inside an argument list always receive exactly one value.
    return scalar(
        &FileUtils::filenameConcatenate(
            $ENV{'GSDLHOME'}, 'bin', $ENV{'GSDLOS'}, 'xpdf-tools', 'bin'
        )
    );
}
962
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# Arguments:
#   $dirname          not used by this routine
#   $input_filename   document to convert
#   $output_filestem  output path stem; "<stem>.out" and "<stem>.err" receive
#                     the child's stdout/stderr while it runs
#   $output_type      requested output type; everything up to and including
#                     the last underscore is stripped before the remainder is
#                     passed to pdfpstoimg.pl's -convert_to option
#
# Returns 1 if pdfpstoimg.pl finished with a zero status, 0 otherwise
# (including when the ImageMagick probe reports "program not found").
# On failure the child's stderr is echoed to STDERR and appended to
# $faillogfile when that is non-empty. The .out/.err scratch files are
# removed on both paths.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;

        # The captured output is not examined; only the resulting $? matters.
        my $identify_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();

    # pdfpstoimg.pl itself is given without a path; perl's -S switch makes
    # perl look the script up via PATH.
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no separate stderr redirection in this branch: stdout goes to .err
        $cmd .= " > \"$output_filestem.err\"";
    }

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($retval == -1) {
            # $! only describes the failure when the command could not be started
            print STDERR ": $!";
        }
        elsif ($retval & 127) {
            print STDERR ": terminated by signal " . ($retval & 127);
        }
        else {
            print STDERR ": exit status " . ($retval >> 8);
        }
        print STDERR "\n";
    }

    # make sure the converter succeeded
    if ($retval != 0) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err")
                or die "Couldn't open $output_filestem.err: $!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }

        if (-e "$output_filestem.err") {
            # append the converter's stderr to the fail log, when one is configured
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $err_line = <$errlog>) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1046
# Convert a PDF file to text with xpdftools' pdftotext command
# Works for Windows too, whereas the old pdftotxt didn't
#
# Arguments:
#   $dirname          not used by this routine
#   $input_filename   PDF file to convert
#   $output_filestem  output path stem; the text is written to "<stem>.text"
#
# Builds the pdftotext command line and hands it to _run_pdf_to_text_cmd(),
# whose return value (1 on success, 0 on failure) is passed straight back.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-txt conversion tool we're going to use
    my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # For xpdf's pdftotxt options, see https://www.xpdfreader.com/pdftotext-man.html
    my $cmd = "\"$xpdf_pdftotxt\"";

    # Use the encoding requested through $enc when there is one. Otherwise ask
    # for UTF-8 explicitly, since pdftotext itself defaults to Latin-1
    # (see https://www.xpdfreader.com/xpdfrc-man.html).
    my $encoding = $enc ? $enc : "UTF-8";
    # quoted like every other argument, so an unusual encoding name cannot
    # split into several shell words
    $cmd .= " -enc \"$encoding\"";

    $cmd .= " -nopgbrk";
    # Avoid the silly solitary carriage returns (CR in Notepad) at the end
    # of lines that ends up as \n appended to the doc title
    # by setting the end of line marker to unix style solitary newline (LF or \n),
    # which doesn't end up in the doc title
    $cmd .= " -eol unix";
    $cmd .= " \"$input_filename\" \"$output_filestem.text\"";

    # debugging aid, disabled so normal runs don't write this to STDERR
    #print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1078
# Convert a PDF file to text using whichever "pdftotext" the shell finds.
#
# $dirname is accepted but not used. The text is written to
# "<output_filestem>.text". The result of _run_pdf_to_text_cmd() is returned
# unchanged.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $cmd = join(' ', 'pdftotext', "\"$input_filename\"", "\"$text_file\"");

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1088
# Run an already assembled pdf-to-text command line and check its product.
#
# Arguments:
#   $cmd              command line; output redirections are appended here
#   $output_filestem  output path stem; the command is expected to write
#                     "<stem>.text", and "<stem>.out"/"<stem>.err" are used
#                     as scratch files for the redirections
#
# Returns 1 when "<stem>.text" exists, is non-empty and contains at least one
# word character; otherwise 0. On failure the command's stderr is echoed to
# STDERR and appended to $faillogfile when that is non-empty. The scratch
# files are removed on both paths.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason;
        if ($retval == -1) {
            # $! only describes the failure when the command could not be started
            $reason = "$!";
        }
        elsif ($retval & 127) {
            $reason = "terminated by signal " . ($retval & 127);
        }
        else {
            $reason = "exit status " . ($retval >> 8);
        }
        print STDERR "Error executing $cmd: $reason\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            # explicit defined(): a last line consisting of "0" must still be examined
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extr_text;
        }
        else {
            # an unreadable file is treated the same as one without text
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err")
                or die "Couldn't open $output_filestem.err: $!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e "$output_filestem.err") {
            # append the converter's stderr to the fail log, when one is configured
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $err_line = <$errlog>) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    # success: clear away both scratch files (the .out file only exists when
    # stdout was redirected to it above)
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1150
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename   PostScript file to convert
#   $output_filestem  output path stem; the text ends up in "<stem>.text",
#                     and "<stem>.err" / "<stem>.text.tmp" are scratch files
#
# Strategy: on non-Windows systems gs is run with ps2ascii.ps. If that fails
# (or on Windows, where gs is not attempted) the text is scraped out of the
# PostScript source with a series of regular expressions. Either way the
# result is re-wrapped so that lines break at the first whitespace after
# $wrap_length characters.
#
# Returns 1 on success. Returns 0 when the fallback cannot open its files or
# the input does not start with "%!". Dies if the files needed for the final
# wrapping step cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths so a filestem containing spaces still works
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        system($cmd);
        # see man perlfunc - system for this...
        # NOTE(review): only the exit-code byte is examined, so a child that is
        # reported as killed by a signal (low bits of $?) counts as success here.
        my $retcode = $? >> 8;
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            if ($!) {$error=$!;} else {$error="couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error="did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error="interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e "$output_filestem.err" && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Both opens are attempted before giving up, so each failure gets its
        # own warning. The lexical handles are closed automatically on return.
        my $in_ok = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" unless ($in_ok);
        my $out_ok = open(my $out, '>', "$output_filestem.text");
        warn "Couldn't write file: $!" unless ($out_ok);
        if (!$in_ok || !$out_ok) {print STDERR "errors\n"; return 0;}

        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # "(|)" -> " - " (em-dash?)

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &FileUtils::moveFiles("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" rather than "||": with "||" the die was attached to the filename
    # string and could never be reached.
    open(my $infile, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$infile>)) {
        while (length($line)>0) {
            if (length($line)>$wrap_length) {
                # peel off $wrap_length characters plus the rest of the current word
                $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line="";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles("$output_filestem.text.tmp");

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1321
1322
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename   file to convert
#   $output_filestem  output path stem; the page is written to "<stem>.html"
#
# The text is first produced by any_to_text() in "<stem>.text", then wrapped
# in a minimal HTML page: blank lines become <p>, every other line is
# prefixed with <br>, and the characters &, < and > are escaped.
# The intermediate .text file is removed afterwards.
#
# Returns 1 on success. Returns 0 when any_to_text() fails or when either
# the text or the HTML file cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        # don't leave the intermediate text file behind
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # "&" has to be escaped first, otherwise the entities produced for
        # "<" and ">" would themselves be escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1359
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename   file to scan (read in binary mode)
#   $output_filestem  output path stem; the text is written to "<stem>.text"
#
# Does nothing and returns 0 unless $use_strings (set outside this sub) is
# true. Otherwise runs of non-printable bytes are turned into line breaks,
# fragments shorter than 10 characters are dropped, and whatever remains is
# written out.
#
# Returns 1 if at least one chunk of text was written. Returns 0 if a file
# could not be opened or nothing survived the filtering; in the latter case
# the output file is removed again.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    # Three-argument opens: the file names are used exactly as given.
    # Lexical handles are closed automatically on the early returns.
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    open(my $out, '>', "$output_filestem.text") or return 0;

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {
        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}
Note: See TracBrowser for help on using the repository browser.