source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32226

Last change on this file since 32226 was 32226, checked in by ak19, 6 years ago

Making xpdf_to_text, which uses xpdf-tools' pdftotext, the PDF-to-text conversion tool for Linux and Mac as well. It was recently adopted for Windows, which previously had no PDF-to-text conversion tool and used to output HTML instead. Since the introduction of xpdf-tools into GS we can support newer PDF versions, so using its pdftotext as the default tool for PDF-to-text conversions seems to be the way forward.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 43.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
46BEGIN {
47 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module can only be loaded on Windows; when the require fails the
# eval yields undef, which is normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT() };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;
61
62my $use_strings;
63my $pdf_complex;
64my $pdf_nohidden;
65my $pdf_zoom;
66my $pdf_ignore_images;
67my $pdf_allow_images_only;
68my $windows_scripting;
69my $enc;
70
# print_usage()
#
# Writes the command-line synopsis to STDERR and then terminates the script
# with exit status 1. This sub never returns to its caller.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }

    exit(1);
}
92
93my $faillogfile="";
94my $timeout=0;
95my $verbosity=0;
96
# main(@args)
#
# Entry point of the script. Parses the command-line options, works out the
# input type (from -type or, failing that, from the filename extension),
# changes into the input file's directory and dispatches to the matching
# convertXXX routine. Whatever that routine returns is printed on STDOUT
# followed by a newline.
#
# Usage errors end in print_usage(), which exits with status 1. An unreadable
# input file, a missing extension with no -type, or an unsupported type also
# exit with status 1.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    # Currently only have VBA for Word and PPT (but no XLS).
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The raw arguments still carry their leading dash(es) at this point, so
    # the dashes have to be allowed for; the bare word is still accepted for
    # backward compatibility.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back on the filename extension. $suffix still
    # has its leading dot; with no extension at all $input_type stays
    # undefined so that the dedicated error below is reported.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix =~ m/^\.(.+)$/s) ? lc($1) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
202
203&main(@ARGV);
204
205
206
207# Document-type conversion functions
208#
209# The following functions attempt to convert documents from their
210# input type to the specified output type. If no output type was
211# given, then they first attempt HTML, and then TEXT.
212#
213# Each returns the output type ("html" or "text") or "fail" if no
214# conversion is possible.
215
# Convert a Microsoft Word document.
#
# The .doc extension is unreliable, so the file content is sniffed first and
# the work is handed to the converter matching the detected type. Returns
# whatever the chosen converter returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word_type{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic treatment
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
233
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML output is acceptable (no output type given, or one containing
# "html"), tries the Word-specific converter: the native scripting one if
# -windows_scripting is on, otherwise the wvware-based one. Returns "html" if
# that works, else whatever convertAnything returns.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
254
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (native scripting if
# -windows_scripting is on, rtftohtml otherwise). Returns "html" on success
# and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # RTF markup is so ugly that it is not worth running strings over it, so
    # there is deliberately no fallback to convertAnything here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
281
282
# Convert an unidentified file.
#
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type allows it (an empty output type
# allows both). Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
307
308
309
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images,
# HTML via the old pdftohtml, paged HTML via xpdf-tools' pdftohtml, and plain
# text via xpdf-tools' pdftotext. Returns "item", "html", "paged_html" or
# "text" for the first attempt that succeeds, otherwise "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if ($output_type =~ m/^html/i) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    my $no_preference = !$output_type;

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools. This is what is tried first when no output type is given.
    if ($no_preference || ($output_type =~ m/paged_html/i)) {
        return "paged_html"
            if (&xpdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT; xpdf-tools' pdftotext is used on all
    # platforms
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if (&xpdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";

}
362
363
# Convert an Adobe PostScript document.
#
# Tries page images when an image type is requested, then plain text when the
# output type is empty or mentions "text". Returns "item", "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
388
389
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting on and an output type that mentions neither "html"
# nor "text", the pptextract script is run to produce slide images ("item").
# Otherwise, when HTML is acceptable, ppttohtml.pl is run ("html"). If the
# chosen converter fails, or none applies, generic text extraction is tried
# ("text"). Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pptextract option selecting the image format; empty if none matches
        my $image_flag = ($output_type =~ m/gif/i)  ? "-g"
                       : ($output_type =~ m/jp?g/i) ? "-j"
                       : ($output_type =~ m/png/i)  ? "-p"
                       :                              "";

        my $extractor = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                        $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # on windows the .vbs VBScript is launched through CScript
            $extractor = "CScript //Nologo \"".$extractor.".vbs\"";
        }

        # if the converting directory already exists there is nothing to do
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "\"" . &util::get_perl_exec() . "\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
455
456
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run ("html"). If that fails, or
# HTML was not wanted, generic text extraction is tried ("text"). Returns
# "fail" when nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "\"" . &util::get_perl_exec() . "\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
490
491
492
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"        - -windows_scripting is on and the name ends in ".docx"
#   "rtf"         - the first line starts with "{\rtf"
#   "word6/7/8"   - some line contains "Word.Document.<6|7|8>"
#   "unknown"     - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open, so that odd characters in the filename cannot be
    # mistaken for an open mode
    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $type;
}
533
534
535# Specific type-to-type conversions
536#
537# Each of the following functions attempts to convert a document from
538# a specific format to another. If they succeed they return 1 and leave
539# the output document(s) in the appropriate place; if they fail they
540# return 0 and delete any working files.
541
542
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl and returns its exit code, which callers treat as the
# success flag (presumably wvware.pl exits with 1 on success - confirm
# against wvware.pl). Returns 0 if the script could not be started at all or
# was killed by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command cannot be started, and sets the
    # low 7 bits when the child died from a signal. Dividing such a status by
    # 256 gives a non-zero fraction that would be mistaken for success.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    # the exit code proper lives in the high byte
    return $raw_status >> 8;
}
557
# Attempt to convert a word document to html with the word2html scripting program
#
# On windows, .docx input is handed to docx2html.vbs (run through CScript);
# everything else goes to word2html. If "$output_filestem.html" already
# exists the conversion is skipped and counted as a success.
#
# Returns 1 on success. Returns 0 if the converter exits with a non-zero
# status or if the produced file is empty or does not mention "html" on its
# first line. Converter error output is echoed to STDERR and/or appended to
# the -errlog file when one was given.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file) {
            my $faillog;
            my $write_to_fail_log
                = ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) ? 1 : 0;

            if (open(my $errfile, "<", $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # close the error file once it has been read
                close($errfile);
            }
            else {
                print STDERR "Unable to read $err_file: $!\n";
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, "<", $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
658
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml and accepts the result only if the produced HTML contains
# some word characters after its <body> line. If rtftohtml also wrote a
# "<stem>_ToC.html" table of contents, it is spliced into the HTML after the
# header, with its links made relative.
#
# Returns 1 on success (also when the ToC could not be merged; the plain HTML
# is then kept) and 0 on failure. On failure the converter's error output is
# appended to the -errlog file, if one was given, and the working files are
# removed.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s $html_file) {
        # make sure we have some content other than header
        if (open(my $html, "<", $html_file)) {
            my $past_header=0;
            while (my $line = <$html>) {

                # skip everything up to and including the <body> line
                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html);
        }
        else {
            # an unreadable result is treated like a failed conversion
            print STDERR "Unable to read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        my $toc_file = "${output_filestem}_ToC.html";
        if (-e $toc_file) {
            my $src_file = "$output_filestem.src";
            &FileUtils::moveFiles($html_file, $src_file);

            # Open the three files one after the other. Each result is
            # tested explicitly, because "open H, $file || ..." would bind
            # the || to the filename and never notice a failure.
            my ($htmlsrc, $toc, $out);
            my $src_ok = open($htmlsrc, "<", $src_file);
            my $toc_ok = $src_ok && open($toc, "<", $toc_file);
            my $out_ok = $toc_ok && open($out, ">", $html_file);

            if (!$out_ok) {
                # could not merge: put the plain HTML back and settle for that
                close($toc) if ($toc_ok);
                close($htmlsrc) if ($src_ok);
                &FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without word characters (that line is dropped)
            while (defined(my $line = <$htmlsrc>)) {
                last if ($line !~ m/\w/);
                print $out $line;
            }

            # print out table of contents, making links relative
            for (1..2) { my $ignored = <$toc>; } # ignore first 2 lines
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print $out $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print $out $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print $out $line;
            }
            close($htmlsrc);
            close($out);

            &FileUtils::removeFiles($toc_file);
            &FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
        {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
775
776
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the options selected on the command line. Returns 1
# if the command exits cleanly and "$output_filestem.html" is non-empty.
# Otherwise echoes the converter's error output to STDERR, appends it to the
# -errlog file if one was given, removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= "\"" . &util::get_perl_exec() . "\" -S pdftohtml.pl -zoom $pdf_zoom";

    my @switches;
    push(@switches, " -c") if ($pdf_complex);
    push(@switches, " -i") if ($pdf_ignore_images);
    push(@switches, " -a") if ($pdf_allow_images_only);
    push(@switches, " -hidden") unless ($pdf_nohidden);
    $cmd .= join("", @switches);

    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # STDERR only gets its own file where "2>" is available
    my $can_split_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $can_split_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl" . ($! ? ": $!" : "") . "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && (-s $html_file)) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$err_file") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, ">>$faillogfile")) {
            open(my $errlog, "$err_file");
            while (my $line = <$errlog>) {
                print $faillog $line;
            }
            close($errlog);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
840
841
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# Returns 1 if the tool exits cleanly and "pages/index.html" is non-empty.
# Otherwise echoes the tool's error output to STDERR, appends it to the
# -errlog file if one was given, cleans up and returns 0.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # full path to the xpdf-tools conversion tool we're going to use
    my $converter = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # Xpdf tools will only create its conversion products in a dir that does
    # not yet exist, so it is pointed at a "pages" subdirectory of
    # $output_filestem's parent directory (the already generated tmp area for
    # this conversion). The file named after $output_filestem is created
    # later, during post-conversion processing.
    my (undef, $pages_dir)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $pages_dir = &FileUtils::filenameConcatenate($pages_dir, "pages");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$converter\"";
    if ($pdf_zoom) {
        $cmd .= " -z $pdf_zoom";
    }
    $cmd .= " \"$input_filename\" \"$pages_dir\"";

    # STDERR only gets its own file where "2>" is available
    my $can_split_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $can_split_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool" . ($! ? ": $!" : "") . "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && (-s &FileUtils::filenameConcatenate($pages_dir,"index.html"))) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$err_file") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles("$pages_dir") if (-d "$pages_dir");

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, ">>$faillogfile")) {
            open(my $errlog, "$err_file");
            while (my $line = <$errlog>) {
                print $faillog $line;
            }
            close($errlog);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
926
# Returns the path to xpdf-tools's containing bin dir appropriate for this machine's OS and bitness
#
# The result is $GSDLHOME/bin/$GSDLOS/xpdf-tools/<subdir>, where <subdir> is
# "bin32" on windows (the 32 bit tools work the same as the 64 bit ones
# there) and "bin$BITNESS" elsewhere, falling back on "bin32" when the
# BITNESS environment variable is not set.
#
# BITNESS is used rather than GSDLARCH because GSDLARCH isn't always set and
# forcing it to be exported has side-effects: it is only meant to be set when
# many other 32-bit or 64-bit specific subdirectories exist in a greenstone
# installation, none of which need exist for xpdf-tools.
sub _get_xpdftools_bindir {

    my $bits_subdir = "bin32";

    # only unix (linux|darwin) looks at the bitness of the machine
    if ($ENV{'GSDLOS'} !~ m/^windows$/i && $ENV{'BITNESS'}) {
        $bits_subdir = "bin".$ENV{'BITNESS'};
    }

    my $xpdf_tools_root
        = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    return &FileUtils::filenameConcatenate($xpdf_tools_root, $bits_subdir);
}
952
# Convert a pdf (or PostScript) file to various types of image by running
# pdfpstoimg.pl, which relies on ImageMagick's convert command.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - path stem; the converter's stdout/stderr are captured in
#                      "$output_filestem.out" / "$output_filestem.err"
#   $output_type     - requested type; everything up to and including the last
#                      underscore is stripped before it is passed to -convert_to
#
# Returns 1 if pdfpstoimg.pl exited with status 0. Otherwise echoes the
# converter's error log to STDERR, appends it to $faillogfile (when set and
# writable), removes the temporary .out/.err files and returns 0.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # Only the exit status is of interest; the captured output is discarded.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # this branch only redirects stdout, and sends it to the .err file
        $cmd .= " > \"$err_file\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }

        # keep a copy of the error output in the fail log, then tidy up
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                # only copy when the error log can actually be read
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1036
# Convert a PDF file to text with the pdftotext binary shipped in xpdf-tools.
# Works for Windows too, whereas the old pdftotxt didn't.
#
# $dirname is accepted but not used. The extracted text is written to
# "$output_filestem.text"; the return value is that of _run_pdf_to_text_cmd.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # full path to the conversion tool inside the OS/bitness specific bin dir
    my $pdftotext_exe = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # Decode the bytes in the file using the designated encoding scheme if one was
    # given. According to https://www.xpdfreader.com/pdftotext-man.html the tool
    # itself would default to Latin-1, so fall back to UTF-8 explicitly instead
    # (see https://www.xpdfreader.com/xpdfrc-man.html).
    my $encoding = $enc ? $enc : "UTF-8";

    # For the options see https://www.xpdfreader.com/pdftotext-man.html
    # "-eol unix" sets the end of line marker to a solitary newline (LF), which
    # avoids the solitary carriage returns at the end of lines that would
    # otherwise end up as \n appended to the doc title.
    my @cmd_parts = (
        "\"$pdftotext_exe\"",
        "-enc $encoding",
        "-nopgbrk",
        "-eol unix",
        "\"$input_filename\"",
        "\"$output_filestem.text\"",
    );
    my $cmd = join(" ", @cmd_parts);

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1068
# Convert a PDF file to text with a bare "pdftotext" command (no path is given,
# so the shell has to locate it).
#
# $dirname is accepted but not used. The text is written to
# "$output_filestem.text"; the return value is that of _run_pdf_to_text_cmd.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $cmd = join(" ", "pdftotext", "\"$input_filename\"", "\"$text_file\"");

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1078
# Run an already assembled pdftotext command line and verify that it really
# produced some text.
#
# Arguments:
#   $cmd             - the command line, without any output redirection
#   $output_filestem - the text is expected in "$output_filestem.text"; the
#                      tool's output is captured in "$output_filestem.out"
#                      and "$output_filestem.err"
#
# Returns 1 when "$output_filestem.text" exists, is non-empty and contains at
# least one word character. Otherwise removes the text file, echoes the tool's
# error log to STDERR, appends it to $faillogfile (when set and writable) and
# returns 0. The temporary .out/.err files are removed in both cases.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # on windows only stdout is redirected, and it goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            while (my $line = <$extracted>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            # an unreadable file is treated the same as one without any text
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        # keep a copy of the error output in the fail log, then tidy up
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                # only copy when the error log can actually be read
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print $faillog $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # Success: remove both capture files. The .out file is created by the
    # redirection above on non-windows systems and would otherwise be left behind.
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1140
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - the text is written to "$output_filestem.text"
#
# On non-windows systems gs is tried first. If gs cannot be used or reports an
# error, the reason is appended to $faillogfile (when set and writable) and a
# crude regular-expression based extraction is used instead. The resulting text
# is finally re-wrapped at the first whitespace after 72 characters.
#
# Returns 1 on success. Returns 0 if the fallback extraction cannot open its
# files or the input does not start with '%!'. Dies if the temporary files for
# the wrapping step cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other file names, so a filestem with spaces stays one word
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        # Test the complete wait status rather than only the exit code ($? >> 8):
        # a gs that was killed by a signal (for instance when the ulimit above is
        # exceeded) has exit code 0 in the high byte but must not count as success.
        # if system returns -1 (couldn't start program), look at $! for message
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $ps_out, '<', "$output_filestem.text")) {
                my $first_line = <$ps_out>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $ps_out;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e "$output_filestem.err" && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        my ($ps_in, $text_out);
        unless (open($ps_in, '<', $input_filename)) {
            $errorcode = 1;
            warn "Couldn't read file: $!";
        }
        unless (open($text_out, '>', "$output_filestem.text")) {
            $errorcode = 1;
            warn "Couldn't write file: $!";
        }
        if ($errorcode) {
            print STDERR "errors\n";
            # whichever handle did open is released when it goes out of scope here
            return 0;
        }

        my $text = ""; # this is for whole .ps file...
        $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close $ps_in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            close $text_out; # don't leave the output handle open
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $text_out "$text";
        close $text_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" (not "||") so that the die applies to the open and not to the file name
    open(my $unwrapped, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $wrapped, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line = <$unwrapped>) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                # emit everything up to the first whitespace after the wrap length
                print $wrapped "$1\n";
            } else {
                # short remainder (or nothing left to split off): emit as is
                print $wrapped "$line";
                $line = "";
            }
        }
    }
    close $unwrapped;
    close $wrapped;
    &FileUtils::removeFiles("$output_filestem.text.tmp");

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1311
1312
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - the HTML is written to "$output_filestem.html";
#                      "$output_filestem.text" is used as an intermediate file
#
# The text is first extracted with any_to_text. Each non-blank line is then
# written after a "<br> ", and each blank line becomes a "<p>".
#
# Returns 1 on success, 0 if any_to_text fails or a file cannot be opened.
# The intermediate text file is removed in either case.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_in, $html_out);
    unless (open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    unless (open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_in;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_out "<html><head>\n";
    print $html_out "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_out "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_out "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # Escape the HTML special characters. The ampersand has to go first so
        # that the entities produced for < and > are not escaped a second time.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_out "<p>";
        } else {
            print $html_out "<br> ", $line;
        }
    }
    print $html_out "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1349
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan (read in binary mode)
#   $output_filestem - the strings found are written to "$output_filestem.text"
#
# Does nothing and returns 0 unless $use_strings is set. Returns 0 as well if
# either file cannot be opened, or if nothing printable was found (in which
# case the output file is removed again). Returns 1 otherwise.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    unless (open($out, '>', "$output_filestem.text")) {
        close $in; # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.