root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 32263

Revision 32263, 42.7 KB (checked in by ak19, 2 years ago)

gsConvert.pl's _get_xpdftools_bindir() is now drastically reduced in size as we're now working with our self-compiled xpdf-tools binaries on 32 bit linux, and the pre-compiled windows 32 bit binaries, and won't need to choose between bin32 and bin64 subfolders. Tomorrow will test whether the mac bins work on Mountain Lion and the newer OS, in which case there will be a universal xpdf-tools/bin directory for Mac as well.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Greenstone's own perl modules live under $GSDLHOME/perllib, so that
# directory has to be on @INC before the "use" lines below are compiled.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded (which is
# what happens on non-Windows platforms); that is normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line flags.  These are filled in by parsargv::parse() in main()
# and then read by the conversion subs further down the file.
my $use_strings;            # -use_strings
my $pdf_complex;            # -pdf_complex
my $pdf_nohidden;           # -pdf_nohidden
my $pdf_zoom;               # -pdf_zoom
my $pdf_ignore_images;      # -pdf_ignore_images
my $pdf_allow_images_only;  # -pdf_allow_images_only
my $windows_scripting;      # -windows_scripting
my $enc;
70
# Print the command-line help on STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage
{
    my @usage = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this line
        "\t\t-pdf_complex is set)\n",
    );
    print STDERR @usage;
    exit(1);
}
92
# Settings shared between main() and the conversion subs below.
my $faillogfile = "";   # -errlog: converter errors are appended to this file when non-empty
my $timeout     = 0;    # -timeout: when non-zero, commands are prefixed with "ulimit -t"
my $verbosity   = 0;    # copied from -verbose in main(); handed on to wvware.pl
96
# Entry point.  Parses the command line, works out the input type (from
# -type or, failing that, from the filename extension), changes into the
# input file's directory and dispatches to the matching convertXXX() sub.
# The value returned by that sub is printed on STDOUT followed by a
# newline.  Exits with status 1 on any usage or input error.
sub main
{
    my (@args) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # Options are given with a leading dash (see print_usage), so the dash
    # has to be allowed for here; the bare word is still accepted too.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
             "type/$type_re/", \$input_type,
             '/errlog/.*/', \$faillogfile,
             'output/(auto|html|text|pagedimg).*/', \$output_type,
             'timeout/\d+/0',\$timeout,
             'verbose/\d+/0', \$verbose,
             'windows_scripting',\$windows_scripting,
             'use_strings', \$use_strings,
             'pdf_complex', \$pdf_complex,
             'pdf_ignore_images', \$pdf_ignore_images,
             'pdf_allow_images_only', \$pdf_allow_images_only,
             'pdf_nohidden', \$pdf_nohidden,
             'pdf_zoom/\d+/2', \$pdf_zoom
             ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No usable -type given: fall back on the extension without its
    # leading ".".  If there is no extension either, the type is left
    # undefined so that the error below is reported.
    if (!defined $input_type || $input_type eq "")
    {
        if (defined $suffix && $suffix ne "") {
            $input_type = lc(substr($suffix, 1));
        }
        else {
            $input_type = undef;
        }
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
202
# Run the conversion using the script's command-line arguments.
main(@ARGV);
204
205
206
207# Document-type conversion functions
208#
209# The following functions attempt to convert documents from their
210# input type to the specified output type.  If no output type was
211# given, then they first attempt HTML, and then TEXT.
212#
213# Each returns the output type ("html" or "text") or "fail" if no
214# conversion is possible.
215
# Convert a file that claims to be a Microsoft Word document.
#
# The real type is sniffed first with find_docfile_type(), because many
# .doc files are not Word documents at all.  Word 6/7/8 and docx files go
# to convertWord678(), RTF files to convertRTF(), and anything else to the
# generic convertAnything().  Returns whatever the chosen sub returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    if ($detected =~ m/\A(?:word[678]|docx)\z/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
233
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML output is acceptable (no output type given, or one mentioning
# "html"), the Word-specific converter is tried first: the windows
# scripting route if -windows_scripting was given, doc_to_html() otherwise.
# If that does not succeed, falls back on convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_html = (!$output_type || ($output_type =~ m/html/i));

    if ($wants_html) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
254
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (windows scripting if
# -windows_scripting was given, rtf_to_html() otherwise).  Returns "html"
# on success and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_html = (!$output_type || ($output_type =~ m/html/i));

    if ($wants_html) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # There is deliberately no fallback to convertAnything() here: RTF
    # markup is too noisy for running strings over it to be worthwhile.
    # (jrm21 noted that some quick'n'dirty regexps could be written one
    # day to extract the text instead.)
    return "fail";
}
281
282
# Convert an unidentified file.
#
# Tries the generic HTML conversion and then the generic text conversion,
# each only when the requested output type allows it (an empty output type
# allows both).  Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
307
308
309
# Convert an Adobe PDF document.
#
# Depending on the requested output type the following are tried in order:
# page images, HTML via the old pdftohtml, paged HTML via xpdf-tools'
# pdftohtml, and plain text via xpdf-tools.  Returns "item", "html",
# "paged_html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last "-" in the output type (if any)
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to HTML.  This uses the old pdftohtml, which
    # doesn't work for newer PDF versions, so it is only tried when HTML
    # was asked for explicitly.
    if ($output_type =~ m/^html/i) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # xpdf-tools.  This is also what is tried first when no output type
    # has been specified.
    if (!$output_type || ($output_type =~ m/paged_html/i)) {
        if (&xpdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "paged_html";
        }
    }

    # Attempt conversion to TEXT (xpdf-tools is used on every platform)
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&xpdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
362
363
# Convert an Adobe PostScript document.
#
# Page images are attempted when the output type names an image format,
# plain text when the output type is empty or mentions "text".  Returns
# "item", "text" or "fail".
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last "-" in the output type (if any)
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
388
389
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract VB script is run to produce slide images ("item").
# Otherwise, when HTML is acceptable, ppttohtml.pl is run ("html").  If the
# chosen converter fails (or none applies) the generic text extraction is
# used.  Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $use_vb_script = ($windows_scripting
                         && ($output_type !~ m/html/i)
                         && ($output_type !~ m/text/i));

    if ($use_vb_script) {
        # Pick the pptextract flag for the requested image format; the
        # first matching pattern wins and no match leaves the flag empty.
        my $ppt_convert_type = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $flag) = @$choice;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $flag;
                last;
            }
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # on windows the .vbs VBScript is launched through CScript
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists there is nothing to do
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
455
456
# Convert a Microsoft Excel document.
#
# When HTML is acceptable xlstohtml.pl is run; if that fails, or HTML was
# not wanted, the generic text extraction is used.  Returns "html", "text"
# or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back on generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
490
491
492
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the filename ends in .docx
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - the file contains "Word.Document.<n>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $detected = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $detected = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $detected = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $detected;
}
533
534
535# Specific type-to-type conversions
536#
537# Each of the following functions attempts to convert a document from
538# a specific format to another.  If they succeed they return 1 and leave
539# the output document(s) in the appropriate place; if they fail they
540# return 0 and delete any working files.
541
542
# Attempt to convert a word document to html with the wv program.
#
# Launches wvware.pl and returns its exit code, which callers treat as a
# boolean success flag.  Returns 0 when wvware.pl could not be launched at
# all or was terminated by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $status = system($launch_cmd);

    # system() returns -1 when the command could not be started, and the
    # low 7 bits of the status hold the signal number if the child was
    # killed.  Simply dividing the status by 256 would turn both of those
    # into a non-zero (i.e. "successful") fraction.
    if ($status == -1 || ($status & 127)) {
        return 0;
    }

    # the exit code proper lives in the high byte
    return $status >> 8;
}
557
# Attempt to convert a word document to html with the word2html scripting
# program (or, on windows with a docx input file, the docx2html VBScript).
#
# Returns 1 when "$output_filestem.html" already exists or was produced and
# its first line mentions "html"; returns 0 otherwise.  Converter error
# output is echoed on STDERR and/or appended to the -errlog file when one
# was given.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .=  "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $errfile;
            if (open($errfile, "<", "$output_filestem.err")) {
                my $faillog;
                my $write_to_fail_log = 0;
                if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                    $write_to_fail_log = 1;
                }

                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($faillog) if ($write_to_fail_log);
                # this handle used to be left open
                close($errfile);
            }
            else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            # copy the converter's error output into the fail log
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
658
# Attempt to convert an RTF document to html with rtftohtml.
#
# The conversion counts as successful when "$output_filestem.html" holds
# some word characters after its <body> line.  On success a generated
# "${output_filestem}_ToC.html" (if any) is merged into the HTML and 1 is
# returned.  On failure the converter's error output is appended to the
# -errlog file (when one was given), the working files are removed and 0
# is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an unreadable
        # file simply counts as "no content"
        if (open(my $html_in, "<", "$output_filestem.html")) {
            my $past_header=0;
            while (my $line = <$html_in>) {

                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # "or" (not "||") is required here: "||" would bind to the
            # filename string, so a failed open would never be counted.
            my ($htmlsrc, $toc, $html_out);
            my $open_failed=0;
            open($htmlsrc, "<", "$output_filestem.src") or ++$open_failed;
            open($toc, "<", "${output_filestem}_ToC.html") or ++$open_failed;
            open($html_out, ">", "$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the table of contents and put the plain
                # conversion result back in place
                foreach my $fh ($htmlsrc, $toc, $html_out) {
                    close($fh) if (defined $fh && defined fileno($fh));
                }
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped)
            while (defined(my $line = <$htmlsrc>)) {
                last if ($line !~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>;   # ignore first 2 lines
            $ignored = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
775
776
# Convert a pdf file to html with the old pdftohtml command (via
# pdftohtml.pl), which only works for older PDF versions.
#
# Returns 1 when the command exits with status 0 and leaves a non-empty
# "$output_filestem.html" behind.  Otherwise the converter's error output
# is echoed on STDERR, appended to the -errlog file (when one was given),
# the working files are removed and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";

    my @flags;
    push(@flags, " -c") if ($pdf_complex);
    push(@flags, " -i") if ($pdf_ignore_images);
    push(@flags, " -a") if ($pdf_allow_images_only);
    push(@flags, " -hidden") unless ($pdf_nohidden);
    $cmd .= join("", @flags);

    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stderr gets a file of its own only where "2>" is used; otherwise
    # stdout is what ends up in the .err file
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    # execute the command
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # The converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && defined(open($faillog, ">>$faillogfile"))) {
            open(my $errlog, $err_file);
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
840
841
# Convert a pdf file to html with the newer xpdf-tools' pdftohtml.
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
#
# xpdf's pdftohtml will only write into a directory that does not exist
# yet, so its output goes into a "pages" subdirectory next to
# $output_filestem.  Returns 1 when the tool exits with status 0 and a
# non-empty pages/index.html exists.  Otherwise the error output is echoed
# on STDERR, appended to the -errlog file (when one was given), the working
# files are removed and 0 is returned.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # full path to the xpdf-tools pdftohtml executable
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # The parent directory of $output_filestem is the conversion area that
    # already exists; the tool creates "pages" inside it.
    my (undef, $pages_dir)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $pages_dir = &FileUtils::filenameConcatenate($pages_dir, "pages");

    # formulate the command
    # (xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%)
    my $cmd = "\"$xpdf_pdftohtml\"";
    if ($pdf_zoom) {
        $cmd .= " -z $pdf_zoom";
    }
    $cmd .= " \"$input_filename\" \"$pages_dir\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    # execute the command
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # The converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s &FileUtils::filenameConcatenate($pages_dir, "index.html")) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles("$pages_dir") if (-d "$pages_dir");

    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && defined(open($faillog, ">>$faillogfile"))) {
            open(my $errlog, $err_file);
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
926
# Returns the path to the bin directory that contains the xpdf-tools
# executables for this machine's OS: $GSDLHOME/bin/$GSDLOS/xpdf-tools/bin.
# The path is built from the GSDLHOME and GSDLOS environment variables.
sub _get_xpdftools_bindir {
    my @path_parts = ($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools", "bin");
    my $bindir = &FileUtils::filenameConcatenate(@path_parts);
    return $bindir;
}
934
# Convert a PDF/PS file to images by running pdfpstoimg.pl (which relies on
# ImageMagick's convert utility).
#
# Parameters: ($dirname, $input_filename, $output_filestem, $output_type).
# $dirname is not used here. Everything in $output_type up to and including
# its last underscore is stripped before it is passed to -convert_to.
# The sub also reads $verbosity, $timeout, $is_winnt_2000 and $faillogfile,
# which are declared outside this sub.
#
# Returns 1 if pdfpstoimg.pl exited with status 0. Otherwise (or when
# ImageMagick appears not to be installed) returns 0; on a failed run the
# converter's error output is echoed to STDERR and appended to $faillogfile
# when one is set.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found":
        # -1 on Linux and 256 on Windows. Once shifted by 8 and converted from
        # unsigned to signed, that is -1 for Linux and 1 for Windows.
        # Although gs-magick.pl already shifts its $? by 8, converts it to a
        # signed value and exits on that, we need to do it again here.
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    # NOTE(review): ulimit is a Unix shell builtin; confirm $timeout is never
    # set when this runs on Windows.
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    # pdfpstoimg.pl is located via "perl -S" rather than by full path (to
    # avoid having to play about with quoting when GSDLHOME might contain
    # spaces), so the PATH is assumed to be set up correctly.
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";
    }

    # Only the exit status is checked; the names of the generated image
    # files are not known here.
    if ($retval != 0) {
        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $err_fh, '<', $err_file) || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR $err_line;
            }
            close $err_fh;
        }

        if (-e $err_file) {
            # append the converter's error output to the fail log, if any
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                if (open(my $err_fh, '<', $err_file)) {
                    while (my $err_line = <$err_fh>) {
                        print {$fail_fh} $err_line;
                    }
                    close $err_fh;
                }
                close $fail_fh;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1018
# Convert a PDF file to text with xpdftools' pdftotext command.
# Works for Windows too, whereas the old pdftotxt didn't.
#
# Parameters: ($dirname, $input_filename, $output_filestem); $dirname is not
# used here. The text is written to "$output_filestem.text". Reads $enc,
# which is declared outside this sub.
# Returns whatever _run_pdf_to_text_cmd returns for the assembled command.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the pdf-to-text conversion tool we're going to use
    my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # For the options, see https://www.xpdfreader.com/pdftotext-man.html
    # -enc: use the designated encoding when one was given. pdftotext itself
    #  defaults to Latin-1, so fall back to UTF-8 instead
    #  (see https://www.xpdfreader.com/xpdfrc-man.html).
    my $encoding = $enc ? $enc : "UTF-8";

    my $cmd = join("",
        "\"$xpdf_pdftotxt\"",
        " -enc $encoding",
        " -nopgbrk",
        # Unix style end of line (solitary LF): avoids the solitary carriage
        # returns at the end of lines that end up as \n appended to the doc title.
        " -eol unix",
        " \"$input_filename\" \"$output_filestem.text\"");

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1050
# Convert a PDF file to text with whichever pdftotext command is on the PATH.
#
# Parameters: ($dirname, $input_filename, $output_filestem); $dirname is not
# used here. The text is written to "$output_filestem.text".
# Returns whatever _run_pdf_to_text_cmd returns for the assembled command.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $quoted_input  = "\"$input_filename\"";
    my $quoted_output = "\"$output_filestem.text\"";
    my $cmd = join(" ", "pdftotext", $quoted_input, $quoted_output);

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1060
# Run an already assembled pdf-to-text command line and validate its result.
#
# Parameters:
#   $cmd             - the command line; output redirections are appended here
#   $output_filestem - the command is expected to write "$output_filestem.text";
#                      "$output_filestem.out" / ".err" capture its output
# Also reads $faillogfile, which is declared outside this sub.
#
# Returns 1 if a non-empty "$output_filestem.text" containing at least one
# word character was produced. Otherwise the converter's error output is
# echoed to STDERR, appended to $faillogfile (when one is set), the
# intermediate files are removed and 0 is returned.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # On Windows only stdout is redirected, and it goes to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $text_fh, '<', $text_file)) {
            binmode($text_fh); # just in case...
            # Test for definedness rather than truth, so that a final line
            # consisting of just "0" is still inspected.
            while (defined(my $line = <$text_fh>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $text_fh;
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $err_fh, '<', $err_file) || die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR $err_line;
            }
            close $err_fh;
        }

        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if (-e $err_file) {
            # append the converter's error output to the fail log, if any
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                if (open(my $err_fh, '<', $err_file)) {
                    while (my $err_line = <$err_fh>) {
                        print {$fail_fh} $err_line;
                    }
                    close $err_fh;
                }
                close $fail_fh;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    # The non-Windows redirection above always creates the .out file, so
    # clean it up on success too, as the other converters in this file do.
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1122
# Convert a PostScript document to text.
# Note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so we use that cmd here.
#
# Parameters: ($input_filename, $output_filestem). The text is written to
# "$output_filestem.text". Also reads $timeout and $faillogfile, which are
# declared outside this sub.
#
# gs is not attempted on Windows. Whenever gs is not used or fails, the text
# is extracted (crudely) with Perl regular expressions instead. In both cases
# the resulting text is finally re-wrapped at the first whitespace after 72
# characters.
#
# Returns 1 on success, 0 if the regular expression fallback could not open
# its files or the input does not start with "%!". Dies if the files for the
# re-wrapping step cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted, so that a path containing spaces does not break the redirection
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system for this: $? is non-zero when the
        # program couldn't be started (-1), was killed by a signal, or
        # exited with a non-zero code. Look at $! for a message.
        my $wait_status = $?;

        if ($wait_status != 0) {
            if ($!) { $error = $!; } else { $error = "couldn't run.\n"; }
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_fh, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$ps_fh>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $ps_fh;
        }
        else {
            $error = "could not read output file.\n";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            print {$fail_fh} "gs - $error\n";
            if (-e $err_file && open(my $err_fh, '<', $err_file)) {
                while (my $err_line = <$err_fh>) {
                    print {$fail_fh} $err_line;
                }
                close $err_fh;
            }
            close $fail_fh;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        my ($in_fh, $out_fh);
        open($in_fh, '<', $input_filename)
            || ($errorcode = 1, warn "Couldn't read file: $!");
        open($out_fh, '>', $text_file)
            || ($errorcode = 1, warn "Couldn't write file: $!");
        if ($errorcode) { print STDERR "errors\n"; return 0; }

        # this is for whole .ps file...
        my $text = join('', <$in_fh>); # see man perlport, under "System Resources"
        close $in_fh;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                print {$fail_fh} "Bad postscript header: not '%!'\n";
                close $fail_fh;
            }
            close $out_fh;
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # | -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change backslash-escaped parentheses to curly braces
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between parentheses
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        print {$out_fh} "$text";
        close $out_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$output_filestem.text.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" (not "||") so that die applies to open() rather than to the filename
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            # cut after the first run of non-whitespace that follows
            # $wrap_length characters, dropping the whitespace after it
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrap_out} "$1\n";
            } else {
                print {$wrap_out} "$line";
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1293
1294
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Parameters: ($input_filename, $output_filestem).
# First any_to_text generates "$output_filestem.text"; that text is then
# wrapped line by line into "$output_filestem.html" (blank lines become <p>,
# other lines are prefixed with <br>) and the text file is removed.
#
# Returns 1 on success. Returns 0 if any_to_text fails or if either file
# cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    unless (open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    unless (open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape & first, so the entities produced below are not re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1331
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Parameters: ($input_filename, $output_filestem). The text is written to
# "$output_filestem.text". Reads $use_strings, which is declared outside
# this sub; nothing is attempted unless it is true.
#
# Returns 1 if at least one chunk of text was written. Returns 0 if
# $use_strings is false, a file could not be opened, or nothing printable
# was found (in which case the output file is removed again).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    print STDERR "\n**** In any to text****\n\n";
    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);
    my $out_fh;
    unless (open($out_fh, '>', $text_file)) {
        close $in_fh;
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in_fh>)) {

        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out_fh} $line;
            ++$output_line_count;
        }
    }

    close $out_fh;
    close $in_fh;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;
}
Note: See TracBrowser for help on using the browser.