root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 32273

Revision 32273, 43.5 KB (checked in by ak19, 23 months ago)

First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion are moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. The PDFv2Plugin is yet to be introduced. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though, being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUntil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Runs at compile time, before the 'use' lines below are processed:
# refuse to start without GSDLHOME, and put Greenstone's perllib directory
# at the front of the module search path.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded; the trailing
# 'return 0' inside it is never reached, so the fallback to 0 happens on the
# following line instead.
my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}

# Command-line option values, filled in by parsargv::parse() in main().
my $use_strings;            # -use_strings
my $pdf_tool;               # -pdf_tool: pdftohtml, pdfbox or xpdftools
my $pdf_complex;            # -pdf_complex
my $pdf_nohidden;           # -pdf_nohidden
my $pdf_zoom;               # -pdf_zoom
my $pdf_ignore_images;      # -pdf_ignore_images
my $pdf_allow_images_only;  # -pdf_allow_images_only
my $windows_scripting;      # -windows_scripting
my $enc;                    # NOTE(review): not set by any option parsed in main() -- confirm where it is used
71
# Write the command-line help text to STDERR and terminate the script
# with exit status 1.  Never returns.
sub print_usage
{
    # Each entry is written out verbatim and in order.
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $chunk (@usage_text) {
        print STDERR $chunk;
    }

    exit(1);
}
94
# Settings shared with the conversion routines below; set from the command
# line by parsargv::parse() in main().
my $faillogfile="";   # -errlog: file that converter error output is appended to
my $timeout=0;        # -timeout: cpu seconds, applied via "ulimit -t" when non-zero
my $verbosity=0;      # copied from -verbose in main(); passed on to wvware.pl
98
# Entry point: parse the command line, work out the input type, change into
# the input file's directory and dispatch to the matching convertXXX routine.
#
# Arguments: the command-line argument list (normally @ARGV).
#
# Prints whatever the chosen convertXXX routine returns, followed by a
# newline, on STDOUT.  Calls print_usage() (which exits) when option parsing
# fails or when the options are not followed by exactly one filename.  Exits
# with status 1 when the input file is unreadable, when no input type can be
# determined, or when the type is not one this script handles.
sub main
{
    my (@args) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The flag is given on the command line as "-windows_scripting" (see
    # print_usage), so the leading dash(es) must be allowed for here;
    # otherwise the enhanced type list could never be selected.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
             "type/$type_re/", \$input_type,
             '/errlog/.*/', \$faillogfile,
             'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html
             'timeout/\d+/0',\$timeout,
             'verbose/\d+/0', \$verbose,
             'windows_scripting',\$windows_scripting,
             'use_strings', \$use_strings,
             'pdf_tool/(pdftohtml|pdfbox|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools
             'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool)
             'pdf_ignore_images', \$pdf_ignore_images,
             'pdf_allow_images_only', \$pdf_allow_images_only,
             'pdf_nohidden', \$pdf_nohidden,
             'pdf_zoom/\d+/2', \$pdf_zoom
             ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # Exactly one argument (the input file) must remain after the options
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back to the filename extension without its
    # leading ".".  A file without an extension leaves the type empty.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = (defined $suffix && length($suffix) > 1)
            ? lc(substr($suffix, 1))
            : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
205
# Run the conversion with the arguments this script was invoked with.
&main(@ARGV);
207
208
209
210# Document-type conversion functions
211#
212# The following functions attempt to convert documents from their
213# input type to the specified output type.  If no output type was
214# given, then they first attempt HTML, and then TEXT.
215#
216# Each returns the output type ("html" or "text") or "fail" if no
217# conversion is possible.
218
# Convert a file that claims to be a Microsoft Word document.
#
# The real format is sniffed first with find_docfile_type(): Word 6/7/8 and
# docx go to convertWord678(), RTF goes to convertRTF(), and anything else
# goes to convertAnything().  Returns whatever the chosen routine returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word_format = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word_format{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
236
# Convert a Microsoft word 6/7/8 document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# try native_doc_to_html() if -windows_scripting is set, doc_to_html()
# otherwise, and return "html" on success.  In every other case hand over to
# convertAnything() and return its result.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));

    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
256
257
# Convert a Rich Text Format (RTF) file.
#
# Only HTML output is attempted: native_doc_to_html() when
# -windows_scripting is set, rtf_to_html() otherwise.  Returns "html" on
# success and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

# rtf is so ugly that it's not worth running strings over.
# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
#    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
284
285
# Convert an unidentified file.
#
# Tries any_to_html() and then any_to_text(), skipping an attempt when an
# output type was given that does not mention that format.  Returns "html"
# or "text" for the first attempt that succeeds, otherwise "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # [pattern the requested type must match, converter, value to return]
    my @attempts = (
        [ qr/html/i, \&any_to_html, "html" ],   # simple conversion to HTML
        [ qr/text/i, \&any_to_text, "text" ],   # conversion to text
    );

    foreach my $attempt (@attempts) {
        my ($wanted_re, $converter, $result) = @$attempt;

        next unless (!$output_type || ($output_type =~ $wanted_re));

        my $converted = $converter->($input_filename, $output_filestem);
        return $result if ($converted);
    }

    return "fail";
}
310
311
312
# Convert an Adobe PDF document.
#
# The tool named by -pdf_tool decides which converters are tried:
#   pdftohtml - images via pdfps_to_img(), HTML via pdf_to_html(), text via
#               pdf_to_text(), each only when the output type asks for it
#   xpdftools - (paged) HTML via xpdf_to_html(), text via xpdf_to_text();
#               an empty output type defaults to "html"
# Any other tool value (including pdfbox, which does not currently go through
# gsConvert.pl as PDFBoxConverter inherits from AutoLoadConverters) yields "fail".
#
# Returns "item", "html", "paged_html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last "-" in the output type, if there is one
    $output_type =~ s/.*\-(.*)/$1/i;

    if ($pdf_tool eq "pdftohtml") { # old pdftohtml tool

        # Attempt conversion to Image
        if ($output_type =~ m/jp?g|gif|png/i) {
            my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
            return "item" if ($made_images);
        }

        # Attempt conversion to HTML
        # Uses the old pdftohtml that doesn't work for newer PDF versions
        if ($output_type =~ m/^html/i) {
            my $made_html = &pdf_to_html($dirname, $input_filename, $output_filestem);
            return "html" if ($made_html);
        }

        # Attempt conversion to TEXT (not for Windows, but
        # PDFPlugin/PDFv1Plugin takes care of that)
        if (!$output_type || ($output_type =~ m/text/i)) {
            my $made_text = &pdf_to_text($dirname, $input_filename, $output_filestem);
            return "text" if ($made_text);
        }

        return "fail";
    }

    if ($pdf_tool eq "xpdftools") {

        # default to html output
        $output_type = "html" if (!$output_type);

        # Conversion to images is not wired up for this tool yet.

        # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools.
        if ($output_type =~ m/^(paged_html|html)$/i) {
            my $made_html = &xpdf_to_html($dirname, $input_filename, $output_filestem);
            return $output_type if ($made_html);
        }

        # Attempt conversion to TEXT
        if (!$output_type || ($output_type =~ m/text/i)) {
            my $made_text = &xpdf_to_text($dirname, $input_filename, $output_filestem);
            return "text" if ($made_text);
        }
    }

    return "fail";

}
390
391
# Convert an Adobe PostScript document.
#
# Tries pdfps_to_img() when the output type names an image format and
# ps_to_text() when text is acceptable (no output type, or one containing
# "text").  Returns "item", "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last "-" in the output type, if there is one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to TEXT
    my $text_acceptable = (!$output_type || ($output_type =~ m/text/i));
    if ($text_acceptable) {
        my $made_text = &ps_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
416
417
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that mentions neither "html"
# nor "text", the pptextract script is run to produce per-slide images and
# "item" is returned (also when the output directory already exists).
# Otherwise, when HTML is acceptable, ppttohtml.pl is run and "html" is
# returned on success.  If neither route succeeds, any_to_text() is tried;
# the result is then "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # STDERR of the external command is only redirected where the shell supports it
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {

        # image format flag understood by pptextract
        my $ppt_convert_type = ($output_type =~ m/gif/i)  ? "-g"
                             : ($output_type =~ m/jp?g/i) ? "-j"
                             : ($output_type =~ m/png/i)  ? "-p"
                             :                              "";

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # on windows the .vbs VBScript is launched through CScript
        if ($on_windows) {
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if (!$on_windows || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if (!$on_windows || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    my $made_text = &any_to_text($input_filename, $output_filestem);
    return "text" if ($made_text);

    return "fail";
}
483
484
# Convert a Microsoft Excel document.
#
# When HTML is acceptable (no output type, or one containing "html"),
# xlstohtml.pl is run and "html" is returned on success.  Otherwise, or when
# that fails, any_to_text() is tried; the result is then "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        # STDERR is only redirected where the shell supports it
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    my $made_text = &any_to_text($input_filename, $output_filestem);
    return "text" if ($made_text);

    return "fail";
}
518
519
520
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Argument: $input_filename - path of the file to inspect.
# Returns:  "docx"    when -windows_scripting is set and the name ends in .docx
#           "rtf"     when the first line starts with "{\rtf"
#           "word6", "word7" or "word8" when a "Word.Document.N" marker is found
#           "unknown" otherwise, including when the file cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # An unreadable file simply cannot be identified
    my $chk_fh;
    if (!open($chk_fh, "<", $input_filename)) {
        return "unknown";
    }
    binmode($chk_fh);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk_fh>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk_fh);

    return $realtype;
}
561
562
563# Specific type-to-type conversions
564#
565# Each of the following functions attempts to convert a document from
566# a specific format to another.  If they succeed they return 1 and leave
567# the output document(s) in the appropriate place; if they fail they
568# return 0 and delete any working files.
569
570
# Attempt to convert a word document to html with the wv program.
#
# Launches wvware.pl through the perl executable reported by
# util::get_perl_exec(), passing the input file, the output filestem, the
# fail log, the verbosity and the timeout.  Returns the value of system()
# divided by 256.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $perl_exec = &util::get_perl_exec();

    my @cmd_parts = (
        "\"$perl_exec\"",
        "-S",
        "wvware.pl",
        "\"$input_filename\"",
        "\"$output_filestem\"",
        "\"$faillogfile\"",
        $verbosity,
        $timeout,
    );
    my $launch_cmd = join(" ", @cmd_parts);

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    return system($launch_cmd)/256;
}
585
# Attempt to convert a word document to html with the word2html scripting
# program (or, for docx input on windows, with docx2html.vbs run through CScript).
#
# Arguments: $input_filename  - document to convert
#            $output_filestem - output path without extension; the HTML goes
#                               to "$output_filestem.html" and, where STDERR
#                               can be redirected, the converter's messages
#                               go to "$output_filestem.err"
# Returns 1 when "$output_filestem.html" already exists, or was produced and
# its first line mentions "html"; returns 0 otherwise.  On failure the
# converter's error output is appended to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if($input_filename =~ m/docx$/i) {  # need to use full path to docx2html script,
                                            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            $vbScript = "CScript //Nologo \"$vbScript\"";   # launch with CScript for error output in STDERR
                                    # //Nologo flag avoids Microsoft's opening/logo msgs
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {  # old doc versions. use the usual VB executable word2html for the
                # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .=  "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";

        # Echo the converter's error output to STDERR and the fail log.
        # Nothing is echoed if the error file cannot be opened.
        my $err_fh;
        if (-s "$output_filestem.err" && open($err_fh, "<", "$output_filestem.err")) {

            my $faillog_fh;
            my $write_to_fail_log=0;
            if ($faillogfile ne "" && open($faillog_fh, ">>", $faillogfile)) {
                $write_to_fail_log=1;
            }

            while (my $line = <$err_fh>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print $faillog_fh "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print $faillog_fh " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }

            close($faillog_fh) if ($write_to_fail_log);
            close($err_fh);   # this handle used to be left open
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?  The first line of the output must
    # mention "html" for it to count.
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, "<", "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, ">>", $faillogfile)) {
            if (open(my $errlog_fh, "<", "$output_filestem.err")) {
                while (my $line = <$errlog_fh>) {print $faillog_fh $line;}
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
686
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments: $input_filename  - RTF file to convert
#            $output_filestem - output path without extension; the HTML goes
#                               to "$output_filestem.html"
# Returns 1 when the generated HTML has some word content after its <body>
# line (merging in the "_ToC.html" table of contents when one was generated),
# 0 otherwise.  On failure the generated files are removed and the
# converter's error output is appended to the fail log when one is configured.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    # make sure we have some content other than header; an output file
    # that cannot be opened counts as a failed conversion
    my $was_successful=0;
    if (-s "$output_filestem.html" && open(my $check_fh, "<", "$output_filestem.html")) {
        my $past_header=0;
        while (my $line = <$check_fh>) {

            if (!$past_header) {
                if ($line =~ m/<body>/) {$past_header=1;}
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) {  # we found some content...
                $was_successful=1;
                last;
            }
        }
        close($check_fh);
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # Each open is checked on its own.  (Written as
            # 'open FH, "name" || ...' the check binds to the filename
            # string and a failure goes unnoticed.)
            my $src_ok = open(my $src_fh, "<", "$output_filestem.src");
            my $toc_ok = open(my $toc_fh, "<", "${output_filestem}_ToC.html");
            my $out_ok = open(my $out_fh, ">", "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the table of contents and restore the plain HTML
                close($src_fh) if ($src_ok);
                close($toc_fh) if ($toc_ok);
                close($out_fh) if ($out_ok);
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (but not
            # including) the first line without a word character
            while (defined(my $header_line = <$src_fh>)) {
                last unless ($header_line =~ m/\w/);
                print $out_fh "$header_line";
            }

            # print out table of contents, making links relative
            my $skipped;
            $skipped = <$toc_fh>;  # ignore first 2 lines
            $skipped = <$toc_fh>;
            my $list_open = <$toc_fh>; # line 3 = "<ol>\n"
            print $out_fh $list_open if (defined $list_open);
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print $out_fh $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (my $body_line = <$src_fh>) {
                print $out_fh $body_line;
            }
            close($src_fh);
            close($out_fh);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, ">>", $faillogfile))
        {
            print $faillog_fh "Error - rtftohtml - couldn't extract text\n";
            #print $faillog_fh "Error - rtf-converter - couldn't extract text\n";
            print $faillog_fh " (rtf file might be too recent):\n";
            if (open(my $errlog_fh, "<", "$output_filestem.err")) {
                while (my $err_line = <$errlog_fh>) {print $faillog_fh $err_line;}
                close($errlog_fh);
            }
            close($faillog_fh);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
803
804
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# Returns 1 when the command succeeded and left a non-empty
# "$output_filestem.html"; otherwise echoes the converter's error output to
# STDERR, appends it to the fail log when one is configured, removes the
# working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";
    my $html_file = "$output_filestem.html";

    # formulate the command
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";

    my @flags;
    push(@flags, "-c")      if ($pdf_complex);
    push(@flags, "-i")      if ($pdf_ignore_images);
    push(@flags, "-a")      if ($pdf_allow_images_only);
    push(@flags, "-hidden") unless ($pdf_nohidden);
    $cmd .= join("", map { " $_" } @flags);

    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # where STDERR cannot be redirected, STDOUT goes to the .err file instead
    my $can_redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $can_redirect_stderr
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    # execute the command
    $!=0;
    my $retval=system($cmd);
    if ($retval!=0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # the converter worked and made something: tidy up and report success
    if ($retval==0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (my $errlog_fh, "$err_file") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <$errlog_fh>) {
            print STDERR "$err_line";
        }
        close($errlog_fh);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        my $faillog_fh;
        if ($faillogfile ne "" && defined(open($faillog_fh, ">>$faillogfile"))) {
            my $errlog_fh;
            open ($errlog_fh, "$err_file");
            while (my $err_line = <$errlog_fh>) {print $faillog_fh $err_line;}
            close($errlog_fh);
            close($faillog_fh);
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
868
869
# Convert a PDF file to "paged HTML" with the newer Xpdf tools' pdftohtml:
# extracted, selectable text is positioned over a screenshot of each page.
# Xpdf's pdftohtml fails if its output directory already exists, so (and for
# easier naming) the products are written to a "pages" subdirectory of the
# tmp directory that contains $output_filestem.
#
# Arguments: ($dirname, $input_filename, $output_filestem); $dirname is not
# used here.
# Returns 1 when the tool exits with status 0 and a non-empty
# pages/index.html exists. Otherwise the tool's error output is echoed to
# STDERR, appended to $faillogfile (when one is set), and 0 is returned.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # path to the pdf-to-html conversion tool we're going to use
    my $converter = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # The tmp area (parent directory of $output_filestem) already exists at
    # this stage, and the file named after $output_filestem is only created
    # later, during post-conversion processing. For now the tool gets a
    # not-yet-existing "pages" directory inside that tmp area.
    my (undef, $pages_dir)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $pages_dir = &FileUtils::filenameConcatenate($pages_dir, "pages");

    my $out_log = "$output_filestem.out";
    my $err_log = "$output_filestem.err";

    # Assemble the command line. Xpdf's pdftohtml takes a zoom factor, where
    # a zoom of 1 is 100%. The -c/-i/-a/-hidden switches of the old pdftohtml
    # are deliberately not passed to this tool.
    my @cmd_parts = ("\"$converter\"");
    push(@cmd_parts, "-z $pdf_zoom") if ($pdf_zoom);
    push(@cmd_parts, "\"$input_filename\" \"$pages_dir\"");
    if ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
        # only a single redirect is used here; stdout lands in the .err file
        push(@cmd_parts, "> \"$err_log\"");
    } else {
        push(@cmd_parts, "> \"$out_log\" 2> \"$err_log\"");
    }
    my $cmd = join(" ", @cmd_parts);

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        my $message = "Error executing xpdf's pdftohtml tool";
        $message .= ": $!" if ($!);
        print STDERR "$message\n";
    }

    # Success: the converter made something, so just discard the log files.
    if ($retval == 0 && -s &FileUtils::filenameConcatenate($pages_dir, "index.html")) {
        &FileUtils::removeFiles($err_log) if (-e $err_log);
        &FileUtils::removeFiles($out_log) if (-e $out_log);
        return 1;
    }

    # Failure from here on.
    &FileUtils::removeFiles($out_log) if (-e $out_log);

    # print out the converter's std err, if any
    if (-s $err_log) {
        open (ERRLOG, $err_log) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR $err_line;
        }
        close ERRLOG;
    }

    # NOTE(review): removeFiles is being handed a directory here - confirm
    # that FileUtils::removeFiles really deletes directories.
    &FileUtils::removeFiles("$pages_dir") if (-d "$pages_dir");

    # keep a copy of the converter's complaints in the fail log, then tidy up
    if (-e $err_log) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, $err_log);
            while (my $err_line = <ERRLOG>) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_log);
    }
    return 0;
}
954
# Returns the path to the bin directory of the bundled xpdf-tools for this
# machine's OS, i.e. $GSDLHOME/bin/$GSDLOS/xpdf-tools/bin.
# NOTE(review): only GSDLOS is consulted here; nothing in this sub selects a
# directory by bitness.
sub _get_xpdftools_bindir {
    my @path_parts = ($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools", "bin");
    return scalar(&FileUtils::filenameConcatenate(@path_parts));
}
962
# Convert a PDF or PostScript file to images of the requested type by
# running pdfpstoimg.pl (which relies on ImageMagick's convert).
#
# Arguments: ($dirname, $input_filename, $output_filestem, $output_type);
# $dirname is not used here. Only the part of $output_type that follows its
# last underscore is passed on as the -convert_to value.
# Returns 1 when pdfpstoimg.pl exits with status 0. Returns 0 when
# ImageMagick is found to be missing or when the script fails; in the latter
# case its error output is echoed to STDERR and appended to $faillogfile
# (when one is set).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if ($ENV{'GSDLOS'} ne "windows" || Win32::IsWinNT()) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        my $identify_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows report "program not found" differently: once $?
        # has been shifted by 8 and converted from an unsigned to a signed
        # value, it is -1 on Linux and 1 on Windows. gs-magick.pl already
        # does this conversion before exiting, but by the time the status
        # arrives here it has to be done again.
        my $status = $? >> 8;
        $status = -(0x100 - ($status & 0xFF)) if ($status & 0x80);

        my $not_found_status = ($ENV{'GSDLOS'} eq "windows") ? 1 : -1;
        if ($status == $not_found_status) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $out_log = "$output_filestem.out";
    my $err_log = "$output_filestem.err";

    # keep only what follows the last underscore of the output type
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();

    # pdfpstoimg.pl is looked up via the PATH (perl -S) instead of being
    # given by full path: this avoids having to play about with quoting when
    # GSDLHOME contains spaces, but assumes the PATH is set up correctly.
    my $cmd = ($timeout) ? "ulimit -t $timeout;" : "";
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
        # only a single redirect is used here; stdout lands in the .err file
        $cmd .= " > \"$err_log\"";
    } else {
        $cmd .= " > \"$out_log\" 2> \"$err_log\"";
    }

    $! = 0;
    my $retval = system($cmd);

    # Success is judged on the exit status alone; no output file is checked.
    if ($retval == 0) {
        &FileUtils::removeFiles($err_log) if (-e $err_log);
        &FileUtils::removeFiles($out_log) if (-e $out_log);
        return 1;
    }

    # Failure from here on.
    my $message = "Error executing pdfpstoimg.pl";
    $message .= ": $!" if ($!);
    print STDERR "$message\n";

    &FileUtils::removeFiles($out_log) if (-e $out_log);

    # print out the converter's std err, if any
    if (-s $err_log) {
        open (ERRLOG, $err_log) || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR $err_line;
        }
        close ERRLOG;
    }

    # keep a copy of the converter's complaints in the fail log, then tidy up
    if (-e $err_log) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, $err_log);
            while (my $err_line = <ERRLOG>) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_log);
    }
    return 0;
}
1046
# Convert a PDF file to text with xpdftools' pdftotext command.
# Works for Windows too, whereas the old pdftotxt didn't.
#
# Arguments: ($dirname, $input_filename, $output_filestem); $dirname is not
# used here. The text is written to "$output_filestem.text".
# Returns whatever _run_pdf_to_text_cmd returns for the assembled command.
# For the pdftotext options, see https://www.xpdfreader.com/pdftotext-man.html
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the pdf-to-txt conversion tool we're going to use
    my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # Decode the bytes in the file using the designated encoding scheme.
    # As per the man page, xpdf's pdftotext defaults to Latin-1, so UTF-8 is
    # requested explicitly whenever no encoding has been configured
    # (see https://www.xpdfreader.com/xpdfrc-man.html).
    my $encoding = ($enc) ? $enc : "UTF-8";

    my $cmd = "\"$xpdf_pdftotxt\"";
    # quoted so that the encoding name always reaches the tool as one argument
    $cmd .= " -enc \"$encoding\"";
    $cmd .= " -nopgbrk";
    # Use unix style end of line markers (a solitary LF): the solitary
    # carriage returns otherwise left at the end of lines end up as \n
    # appended to the doc title.
    $cmd .= " -eol unix";
    $cmd .= " \"$input_filename\" \"$output_filestem.text\"";

    # Debugging aid, disabled so that normal runs don't clutter STDERR
    #print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1078
# Convert a PDF file to text with the plain "pdftotext" command, which is
# expected to be found on the PATH.
#
# Arguments: ($dirname, $input_filename, $output_filestem); $dirname is not
# used here. The text is written to "$output_filestem.text".
# Returns whatever _run_pdf_to_text_cmd returns for the command.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    return _run_pdf_to_text_cmd("pdftotext \"$input_filename\" \"$text_file\"", $output_filestem);
}
1088
# Run an already assembled pdftotext command line and check what it produced.
#
# Arguments:
#   $cmd             - the command, without any output redirection; it is
#                      expected to write its text to "$output_filestem.text"
#   $output_filestem - stem used for the .text, .out and .err files
# Returns 1 if a .text file containing at least one word character was
# produced, otherwise 0. On failure the command's error output is echoed to
# STDERR and appended to $faillogfile (when one is set). The .out and .err
# scratch files are removed in both cases.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # NOTE(review): unlike the other converters in this file, the windows
    # branch here does not consult $is_winnt_2000 - confirm that is intended.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            # read up to the end of the file (not merely up to the first
            # false-valued line) or until a word character turns up
            while (defined(my $line = <$extracted>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extracted);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close($errlog);
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            # keep a copy of the converter's complaints in the fail log
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $err_line = <$errlog>)) {
                        print {$faillog} $err_line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # Success: neither scratch file is needed any more. The .out file is
    # created by the redirect above whenever the non-windows branch is taken.
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1150
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments: ($input_filename, $output_filestem). The text is written to
# "$output_filestem.text", wrapped at the first whitespace after 72
# characters.
# gs is not attempted on windows. Whenever gs is unavailable or fails, the
# text is extracted with a crude set of regular expressions instead, and the
# gs error is appended to $faillogfile (when one is set).
# Returns 1 on success. Returns 0 if the regular expression fallback cannot
# open its files or the input does not start with the "%!" PostScript
# header. Dies if the wrapping step cannot open its files.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path, so a filestem with spaces still works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # see man perlfunc - system: any non-zero wait status is a failure.
        # If the program couldn't be started, $! holds the message.
        if (system($cmd) != 0) {
            $error = ($!) ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_out, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$ps_out>;
            if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close($ps_out);
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (defined(my $err_line = <$errlog>)) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "Stripping text from postscript\n";
        my $in_ok = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" unless ($in_ok);
        my $out_ok = open(my $out, '>', $text_file);
        warn "Couldn't write file: $!" unless ($out_ok);
        if (!$in_ok || !$out_ok) {
            # don't leak the handle that did open, nor leave an empty output
            close($in) if ($in_ok);
            close($out) if ($out_ok);
            &FileUtils::removeFiles($text_file) if (-e $text_file);
            print STDERR "errors\n";
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            # nothing was written, so don't leave an empty output file behind
            close($out);
            &FileUtils::removeFiles($text_file) if (-e $text_file);
            return 0;
        }

        print {$out} _ps_regex_extract_text($text);
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" rather than "||": the latter binds to the filename string, so the
    # die could never be reached
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$infile>)) {
        while (length($line) > 0) {
            # Cut after the first whitespace-delimited word that ends beyond
            # the wrap length. If that can't be done, emit the remainder
            # as-is rather than looping forever.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} $line;
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}

# Crude extraction of the text from the contents of a PostScript file, used
# by ps_to_text when gs is not available. Takes the whole file as one string
# and returns the extracted text.
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_regex_extract_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    $text =~ s/\\023/\023/g;  # e acute ('e)
    $text =~ s/\\177/\252/g;  # u"
#   $text =~ s/ ?? /\344/g;  # a"

    return $text;
}
1321
1322
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: ($input_filename, $output_filestem). any_to_text is run first;
# its "$output_filestem.text" product is then wrapped, line by line, into
# "$output_filestem.html" and removed.
# Returns 1 on success, 0 if any_to_text fails or if either file cannot be
# opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close($text_fh);
        # the intermediate text file is of no further use
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # Escape the HTML special characters. The ampersand has to go first,
        # so that the entities produced for < and > are left alone.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            # a blank line starts a new paragraph
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close($html_fh);
    close($text_fh);

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1359
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments: ($input_filename, $output_filestem). Runs of printable ASCII
# characters that are at least 10 characters long are written, one per line,
# to "$output_filestem.text".
# Returns 1 if anything was written. Returns 0 if $use_strings is not set,
# if either file cannot be opened, or if nothing qualified (in which case
# the output file is removed again).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    # three-argument open, so that nothing in the file names can be mistaken
    # for an open mode
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        close($in); # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;
}
Note: See TracBrowser for help on using the browser.