root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 32226

Revision 32226, 43.7 KB (checked in by ak19, 2 years ago)

Making xpdf_to_text, which uses xpdf-tools' pdftotext, the PDF-to-text conversion tool for Linux and Mac as well. It was recently introduced for Windows, which had no prior PDF-to-text conversion tool and used to output HTML. Since the introduction of xpdf-tools into GS, we can support newer PDF versions, so using its pdftotext as the default tool for PDF-to-text conversions seems to be the way forward.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Runs at compile time, before the "use" lines below are processed:
# refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the
# front of the module search path.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded; that case
# is treated as "no".
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to the
# argument parser.
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
my $enc;
70
# Print the command-line help on STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from the help text
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # closing parenthesis was missing in the original message
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
92
# File-wide settings filled in by main(): the error log to append to
# ("" means none), the timeout in cpu seconds (0 means none) and the
# verbosity level.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
96
# Script entry point.
#
# Parses the command-line arguments passed in @_, works out the input
# type (from -type or, failing that, from the filename extension), changes
# into the input file's directory and dispatches to the matching convertXXX
# routine.  The string returned by that routine is printed on STDOUT
# followed by a newline.
#
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable input file or an unsupported input type.
sub main
{
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # This scan runs on the raw arguments, before they are parsed, so the
    # option still carries its leading dash(es) here; the pattern must
    # allow for them or the enhanced type list is never selected.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back to the extension without its leading
    # dot.  With no extension either, leave the type undefined so that the
    # "No filename extension" error below is the one reported.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
202
# Run the conversion with the script's command-line arguments.
main(@ARGV);
204
205
206
207# Document-type conversion functions
208#
209# The following functions attempt to convert documents from their
210# input type to the specified output type.  If no output type was
211# given, then they first attempt HTML, and then TEXT.
212#
213# Each returns the output type ("html" or "text") or "fail" if no
214# conversion is possible.
215
# Convert a Microsoft word document
#
# Sniffs the real type of the file first and hands over to the Word, RTF
# or generic converter accordingly.  Returns whatever that converter
# returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    my %is_word_like = map { $_ => 1 } ("word6", "word7", "word8", "docx");

    return &convertWord678($input_filename, $output_filestem, $output_type)
        if $is_word_like{$detected};

    return &convertRTF($input_filename, $output_filestem, $output_type)
        if $detected eq "rtf";

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
233
# Convert a Microsoft word 6/7/8 document
#
# When HTML is wanted (or no output type was given) the specialised Word
# converter is tried first; if that is skipped or fails, the generic
# converter decides the result.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
254
# Convert a Rich Text Format (RTF) file
#
# Only a specialised conversion to HTML is attempted.  Returns "html" on
# success and "fail" otherwise.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));

    # Attempt specialised conversion to HTML
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
281
282
# Convert an unidentified file
#
# Tries the generic HTML converter and then the generic text converter,
# each only when the requested output type allows it (an empty or
# undefined type allows both).  Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
307
308
309
# Convert an Adobe PDF document
#
# Depending on $output_type this tries, in order: page images, HTML via
# the old pdftohtml, paged HTML via xpdf-tools, and plain text.
# Returns "item" (images), "html", "paged_html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # The output type may be absent; normalise it so the substitution and
    # pattern matches below do not warn about an undefined value under -w.
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last '-' in the requested type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if ($output_type =~ m/^html/i) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This
    # will be the new default for PDFs when output_type for PDF docs is not specified
    # (once our use of xpdftools' pdftohtml has been implemented on win and mac).
    if (!$output_type || ($output_type =~ m/paged_html/i)) {
        return "paged_html"
            if &xpdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT, using xpdf-tools' pdftotext on all platforms
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if &xpdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
362
363
# Convert an Adobe PostScript document
#
# Tries page images when an image type is requested, and plain text when
# text is requested or no type is given.  Returns "item", "text" or "fail".

sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # The output type may be absent; normalise it so the substitution and
    # pattern matches below do not warn about an undefined value under -w.
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last '-' in the requested type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
388
389
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run to produce per-slide output in the
# directory $output_filestem ("item").  Otherwise, when HTML is wanted or
# no type is given, ppttohtml.pl is tried ("html").  If neither applies or
# the attempt fails, generic text extraction is the last resort ("text").
# Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # The output type may be absent; normalise it so the pattern matches
    # below do not warn about an undefined value under -w.
    $output_type = "" unless defined $output_type;

    # stderr of the external tools is captured everywhere except on
    # Windows versions that are not NT-based
    my $capture_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Image type flag handed to pptextract; stays empty when no image
        # type was requested
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # on windows we now use the .vbs VBScript, launched through CScript
        $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $capture_stderr;

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $capture_stderr;

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
455
456
# Convert a Microsoft Excel document
#
# Runs xlstohtml.pl when HTML is wanted or no output type is given; if
# that is skipped or fails, falls back to generic text extraction.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));

    # Attempt conversion to HTML
    if ($html_wanted) {
        # formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S xlstohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
490
491
492
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns "docx" (only with -windows_scripting, judged by the filename
# alone), "rtf" when the first line starts with "{\rtf", "word6" /
# "word7" / "word8" when a "Word.Document.N" marker is found anywhere in
# the file, and "unknown" otherwise (including when the file cannot be
# opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # case-insensitive, to agree with the docx test in native_doc_to_html
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $doctype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $doctype;
}
533
534
535# Specific type-to-type conversions
536#
537# Each of the following functions attempts to convert a document from
538# a specific format to another.  If they succeed they return 1 and leave
539# the output document(s) in the appropriate place; if they fail they
540# return 0 and delete any working files.
541
542
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl and returns its exit code; callers treat a true
# (non-zero) value as success.
# NOTE(review): this relies on wvware.pl exiting non-zero on success -
# confirm against wvware.pl.
# Returns 0 when the command could not be started or was killed by a
# signal, so that neither is mistaken for a successful conversion.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # -1: the command could not be launched at all
    return 0 if ($raw_status == -1);
    # low 7 bits set: the child died from a signal rather than exiting
    return 0 if ($raw_status & 127);

    # the exit code proper lives in the high byte
    return $raw_status >> 8;
}
557
# Attempt to convert a word document to html with the word2html scripting program
#
# On windows, docx input is handled by docx2html.vbs (run through
# CScript) and everything else by the word2html executable.
# Returns 1 when "$output_filestem.html" already exists or was produced
# and its first line mentions "html"; returns 0 otherwise.  Error output
# is echoed to STDERR and appended to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog_fh;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # echo the converter's error output; skipped if unreadable
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog_fh} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog_fh} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($err_fh);
            }
            close($faillog_fh) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    print {$faillog_fh} $line;
                }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
658
# Attempt to convert an RTF document to html with rtftohtml
#
# Returns 1 when "$output_filestem.html" was produced and contains some
# word characters after its <body> line; a table of contents file
# ("${output_filestem}_ToC.html"), if present, is spliced into the result.
# Returns 0 otherwise, after appending the converter's error output to the
# fail log (when configured) and removing the working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $check_fh, '<', "$output_filestem.html")) {
        # make sure we have some content other than header;
        # an unreadable file simply counts as "no content"
        my $past_header = 0;
        while (my $line = <$check_fh>) {
            if ($past_header == 0) {
                if ($line =~ m/<body>/) {$past_header = 1;}
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) {  # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($check_fh);
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own: written as
            # 'open FH, "file" || ...' the || would bind to the filename
            # string and a failed open would go unnoticed.
            my $src_ok = open(my $src_fh, '<', "$output_filestem.src");
            my $toc_ok = open(my $toc_fh, '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $out_fh, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the ToC and put the plain conversion back
                close($src_fh) if $src_ok;
                close($toc_fh) if $toc_ok;
                close($out_fh) if $out_ok;
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped).
            while (defined(my $header_line = <$src_fh>)) {
                last unless ($header_line =~ m/\w/);
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            <$toc_fh>; <$toc_fh>; # ignore first 2 lines
            my $third_line = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $third_line if defined $third_line;
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (my $src_line = <$src_fh>) {
                print {$out_fh} $src_line;
            }
            close($src_fh);
            close($out_fh);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog_fh} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog_fh} " (rtf file might be too recent):\n";
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $err_line = <$err_fh>) {
                    print {$faillog_fh} $err_line;
                }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
775
776
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# Returns 1 when the command succeeded and "$output_filestem.html" is
# non-empty.  Otherwise the converter's error output is echoed to STDERR,
# appended to the fail log (when configured), the working files are
# removed and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || !-s "$output_filestem.html") {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_fh, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR "$err_line";
            }
            close($err_fh);
        }

        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                if (open(my $err_fh, '<', "$output_filestem.err")) {
                    while (my $err_line = <$err_fh>) {
                        print {$faillog_fh} $err_line;
                    }
                    close($err_fh);
                }
                close($faillog_fh);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
840
841
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# Returns 1 when the tool succeeded and "pages/index.html" is non-empty.
# Otherwise the tool's error output is echoed to STDERR, appended to the
# fail log (when configured), the working files are removed and 0 is
# returned.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the pdf-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # We'll create the file by name $output_filestem during post-conversion processing.
    # Note that Xpdf tools will only create its conversion products in a dir that does
    # not yet exist. So we'll create this location as a subdir of the output_filestem's
    # parent directory. The parent dir is the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
    my ($tailname, $tmp_dirname, $suffix)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    # the other -pdf_* options are not passed on to this tool
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || !-s &FileUtils::filenameConcatenate($tmp_dirname, "index.html")) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_fh, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR "$err_line";
            }
            close($err_fh);
        }

        &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                if (open(my $err_fh, '<', "$output_filestem.err")) {
                    while (my $err_line = <$err_fh>) {
                        print {$faillog_fh} $err_line;
                    }
                    close($err_fh);
                }
                close($faillog_fh);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
926
# Returns the path to the xpdf-tools bin dir appropriate for this machine's
# OS and bitness: $GSDLHOME/bin/$GSDLOS/xpdf-tools/bin<32|BITNESS>
sub _get_xpdftools_bindir {

    # Default to the 32 bit binaries. On Windows these are always used, as
    # the 32 bit pdftohtml works the same as the 64 bit one. They are also
    # the fallback on unix (linux|darwin) when $ENV{'BITNESS'} is not set.
    my $bitness_subdir = "bin32";

    # On unix, use bin32/bin64 depending on the BITNESS env var.
    # Don't use $ENV{'GSDLARCH'} for this: it isn't always set, and it is
    # only (meant to be) set when many other 32-bit or 64-bit specific
    # subdirectories exist in a greenstone installation. None of those need
    # exist when xpdf-tools is installed with GS, and forcing GSDLARCH to be
    # exported has side-effects.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i && $ENV{'BITNESS'}) {
        $bitness_subdir = "bin" . $ENV{'BITNESS'};
    }

    # the containing dir of all the xpdf-tools binaries for this OS
    my $xpdf_tools_home = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    return &FileUtils::filenameConcatenate($xpdf_tools_home, $bitness_subdir);
}
952
# Convert a PDF (or PS) file to various types of image by running
# pdfpstoimg.pl, which relies on ImageMagick's convert command.
#
# Parameters:
#   $dirname         - not used by this routine
#   $input_filename  - the file to convert
#   $output_filestem - output path without extension; also the stem of the
#                      temporary .out/.err files capturing the tool's output
#   $output_type     - requested image type; anything up to and including
#                      the last underscore is stripped before it is passed
#                      to pdfpstoimg.pl as -convert_to
# Also reads $verbosity, $timeout, $is_winnt_2000 and $faillogfile, which are
# defined outside this routine.
#
# Returns 1 if pdfpstoimg.pl exited with status 0; otherwise 0, after echoing
# the tool's error output to STDERR and appending it to $faillogfile (when
# that is not empty). Also returns 0 if ImageMagick appears not to be
# installed.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # only the exit status in $? is of interest, not the output
        my $identify_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";

    # capture the tool's output; in the else branch only stdout is
    # redirected, and it goes to the .err file
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # Only the exit status is checked; no output file is tested for here.
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (defined(my $line = <$errlog>)) {
                print STDERR $line;
            }
            close $errlog;
        }

        #&FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e $err_file) {
            # append the converter's errors to the fail log, if there is one
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $line = <$errlog>)) {
                        print $faillog $line;
                    }
                    close $errlog;
                } else {
                    print STDERR "Couldn't read $err_file: $!\n";
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # success: the captured output is no longer needed
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1036
# Convert a PDF file to text with xpdftools' pdftotext command.
# Works for Windows too, whereas the old pdftotxt didn't.
#
# Builds the pdftotext command line (writing to "$output_filestem.text") and
# hands it to _run_pdf_to_text_cmd(), whose result is returned.
# $dirname is not used by this routine.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the pdf-to-txt conversion tool we're going to use
    my $xpdf_pdftotxt = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # Decode the bytes in the file using the designated encoding scheme.
    # As per https://www.xpdfreader.com/pdftotext-man.html xpdf's pdftotext
    # defaults to Latin-1, so when no encoding was requested we ask for
    # UTF-8 instead (see https://www.xpdfreader.com/xpdfrc-man.html)
    my $encoding = $enc ? $enc : "UTF-8";

    # For xpdf's pdftotext options, see https://www.xpdfreader.com/pdftotext-man.html
    my $cmd = join(" ",
        "\"$xpdf_pdftotxt\"",
        "-enc $encoding",
        "-nopgbrk",
        # Set the end of line marker to a unix style solitary newline (LF
        # or \n). This avoids the solitary carriage returns (CR in Notepad)
        # at the end of lines that end up as \n appended to the doc title.
        "-eol unix",
        "\"$input_filename\"",
        "\"$output_filestem.text\"");

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1068
# Convert a PDF file to text with whichever pdftotext command is found on
# the PATH. The text is written to "$output_filestem.text" and the result of
# _run_pdf_to_text_cmd() is returned. $dirname is not used by this routine.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";

    return _run_pdf_to_text_cmd("pdftotext \"$input_filename\" \"$text_file\"", $output_filestem);
}
1078
# Run a ready-made pdf-to-text command line and check that it produced text.
#
# Parameters:
#   $cmd             - the shell command to run; it is expected to write its
#                      result to "$output_filestem.text"
#   $output_filestem - output path without extension; also the stem of the
#                      temporary .out/.err files capturing the tool's output
# Also reads $faillogfile, which is defined outside this routine.
#
# Returns 1 if "$output_filestem.text" exists afterwards, is not empty and
# has at least one line containing a word character. Otherwise the .text
# file is removed, the tool's error output is echoed to STDERR and appended
# to $faillogfile (when that is not empty), and 0 is returned. The .out and
# .err capture files are removed in both cases.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # capture the tool's output; on windows only stdout is redirected, and
    # it goes to the .err file
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            # test for definedness so that a last line of "0" is still read
            while (defined(my $line = <$extracted>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            # an unreadable file is treated the same as one without text
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) || die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $line = <$errlog>)) {
                print STDERR $line;
            }
            close $errlog;
        }

        # the .out file is created by the stdout redirection above
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if (-e $err_file) {
            # append the converter's errors to the fail log, if there is one
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $line = <$errlog>)) {
                        print $faillog $line;
                    }
                    close $errlog;
                } else {
                    print STDERR "Couldn't read $err_file: $!\n";
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # success: don't leave the captured stdout/stderr files behind
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1140
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# On windows gs is not attempted at all. Whenever gs is not used or fails,
# the text is instead stripped out of the PostScript source with a series of
# regular expressions. Either way the resulting text is finally re-wrapped:
# a line is broken at the first whitespace found once it is 72 or more
# characters long.
#
# Parameters:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without extension; the text ends up in
#                      "$output_filestem.text"
# Also reads $timeout and $faillogfile, which are defined outside this
# routine.
#
# Returns 1 on success. Returns 0 if the regular expression fallback could
# not open its input or output file, or if the input does not start
# with '%!'. Dies if the files needed for the final re-wrapping can't be
# opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted, like the other file names, so that paths with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # Test the whole wait status rather than only the exit code in its
        # high byte, so that termination by a signal also counts as failure.
        # If system returns -1 (couldn't start program), look at $! for message
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $line = <$errlog>)) {
                        print $faillog $line;
                    }
                    close $errlog;
                }
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        # lexical handles are closed automatically on every return below
        open(my $ps_in, '<', $input_filename)
            || ($errorcode = 1, warn "Couldn't read file: $!");
        open(my $text_out, '>', $text_file)
            || ($errorcode = 1, warn "Couldn't write file: $!");
        if ($errorcode) {print STDERR "errors\n"; return 0;}

        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close $ps_in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            # don't leave the still empty output file lying around
            close $text_out;
            &FileUtils::removeFiles($text_file) if (-e $text_file);
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        print $text_out "$text";
        close $text_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles($text_file, "$text_file.tmp");
    # "or" rather than "||": with list-style open, "||" would bind to the
    # file name string and the die could never be reached
    open(my $infile, '<', "$text_file.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$infile>)) {
        while (length($line) > 0) {
            # only use $1 when the substitution really matched
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles("$text_file.tmp");

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1311
1312
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text() and then wrapped in a
# minimal HTML page: blank lines become "<p>", every other line is prefixed
# with "<br> ". The characters &, < and > are escaped so that the extracted
# text can't be mistaken for markup or character entities.
#
# Parameters:
#   $input_filename  - the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# Returns 1 on success. Returns 0 if any_to_text() failed or if the
# intermediate text file or the HTML file could not be opened. The
# intermediate "$output_filestem.text" file is removed in all of these
# cases, except where any_to_text() itself failed.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # the ampersand has to be escaped first, or the entities
        # introduced for < and > would get escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1349
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Every run of bytes outside the printable ASCII range (octal 040-176) is
# turned into a line break, and resulting strings of fewer than 10
# characters are thrown away. What remains is written to
# "$output_filestem.text".
#
# Parameters:
#   $input_filename  - the file to extract strings from (read in binary mode)
#   $output_filestem - output path without extension
# Also reads $use_strings, which is defined outside this routine.
#
# Returns 1 if at least one chunk of text was written. Returns 0 if
# $use_strings is false, if either file could not be opened, or if nothing
# was extracted (the output file is removed again in that last case).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    open(my $in, '<', $input_filename) || return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        # don't leave the input file open when there is nowhere to write to
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;

}
Note: See TracBrowser for help on using the browser.