root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 32223

Revision 32223, 41.7 KB (checked in by ak19, 17 months ago)

When no output mode for PDFPlugin has been set by the user, the output mode now defaults to paged_html (previously html). paged_html uses xpdftools to do the PDF conversion, which will apparently work for all versions of PDF so it gives better version coverage than the old pdftohtml.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Refuse to run without GSDLHOME, and put $GSDLHOME/perllib at the front of
# the module search path so the modules loaded below can be found.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to the
# argument parser.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
69
# Print the usage summary on STDERR and terminate with exit status 1.
# Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
91
# Shared settings, filled in from the command line by main():
#   $faillogfile - file that converter error output is appended to ("" = none)
#   $timeout     - max cpu seconds for a converter (0 = no limit applied)
#   $verbosity   - verbosity level handed on to helper scripts
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Entry point: parse the command line, work out the input type and run the
# matching convert* routine.  The value that routine returns ("html",
# "text", "fail", ...) is printed on STDOUT followed by a newline.
#
# Arguments: the command-line argument list (normally @ARGV).
# Exits with status 1 (after a message or the usage text on STDERR) on bad
# arguments, an unreadable input file, or a missing/unsupported input type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The arguments have not been parsed yet, so the switch still carries
    # its leading dash(es) here; accept it with or without them.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
             "type/$type_re/", \$input_type,
             '/errlog/.*/', \$faillogfile,
             'output/(auto|html|text|pagedimg).*/', \$output_type,
             'timeout/\d+/0',\$timeout,
             'verbose/\d+/0', \$verbose,
             'windows_scripting',\$windows_scripting,
             'use_strings', \$use_strings,
             'pdf_complex', \$pdf_complex,
             'pdf_ignore_images', \$pdf_ignore_images,
             'pdf_allow_images_only', \$pdf_allow_images_only,
             'pdf_nohidden', \$pdf_nohidden,
             'pdf_zoom/\d+/2', \$pdf_zoom
             ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one argument, the input filename, is expected at this point
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back on the filename extension (minus the dot).
    # A file without an extension leaves the type empty, which is reported
    # below instead of being passed through substr() on an empty string.
    if (!defined $input_type || $input_type eq "") {
        $input_type = length($suffix) ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
203
204
205
206# Document-type conversion functions
207#
208# The following functions attempt to convert documents from their
209# input type to the specified output type.  If no output type was
210# given, then they first attempt HTML, and then TEXT.
211#
212# Each returns the output type ("html" or "text") or "fail" if no
213# conversion is possible.
214
# Convert a Microsoft word document.
#
# Many .doc files are not in fact word documents, so the content is sniffed
# first and the file is routed to the converter for what it really is.
# Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# the Word-specific converter is tried first: the windows scripting route if
# -windows_scripting was given, the wv based one otherwise.  If that is not
# attempted or does not succeed, the generic conversion is used.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted.  Returns "html" on
# success and "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
280
281
# Convert an unidentified file.
#
# Tries a simple conversion to HTML and then to text, each only when the
# requested output type allows it (no type given allows both).  Returns
# "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $any_type_ok = !$output_type;

    # Attempt simple conversion to HTML
    if ($any_type_ok || ($output_type =~ m/html/i)) {
        my $made_html = &any_to_html($input_filename, $output_filestem);
        return "html" if $made_html;
    }

    # Convert to text
    if ($any_type_ok || ($output_type =~ m/text/i)) {
        my $made_text = &any_to_text($input_filename, $output_filestem);
        return "text" if $made_text;
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document.
#
# Depending on the requested output type, tries in this order: page images,
# HTML via the old pdftohtml, paged HTML via Xpdftools' pdftohtml, and plain
# text.  Returns "item" (images), "html", "paged_html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if $made_images;
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if ($output_type =~ m/^html/i) {
        my $made_html = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if $made_html;
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools.  This is also what is tried first when no output type
    # was specified for a PDF.
    if (!$output_type || ($output_type =~ m/paged_html/i)) {
        my $made_paged = &xpdf_to_html($dirname, $input_filename, $output_filestem);
        return "paged_html" if $made_paged;
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if $made_text;
    }

    return "fail";
}
356
357
# Convert an Adobe PostScript document.
#
# Tries page images when an image type is requested, then plain text when
# text is requested or no type was given.  Returns "item" (images), "text"
# or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if $made_images;
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &ps_to_text($input_filename, $output_filestem);
        return "text" if $made_text;
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run to produce per-slide output ("item").
# Otherwise, when HTML is acceptable, ppttohtml.pl is tried ("html").
# If neither route returns, plain text extraction is attempted ("text"),
# and "fail" is returned when that does not succeed either.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Pick the pptextract flag for the requested image format;
        # the first matching pattern wins, none leaves the flag empty.
        my $ppt_convert_type = "";
        foreach my $choice ([qr/gif/i, "-g"], [qr/jp?g/i, "-j"], [qr/png/i, "-p"]) {
            my ($pattern, $flag) = @$choice;
            next unless $output_type =~ $pattern;
            $ppt_convert_type = $flag;
            last;
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # on windows the .vbs VBScript is launched through CScript
        $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    my $made_text = &any_to_text($input_filename, $output_filestem);
    return "text" if $made_text;

    return "fail";
}
449
450
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is tried ("html"); if it is not
# attempted or fails, plain text extraction is attempted ("text").
# Returns "fail" when nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    my $made_text = &any_to_text($input_filename, $output_filestem);
    return "text" if $made_text;

    return "fail";
}
484
485
486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns one of:
#   "docx"    - -windows_scripting is on and the filename ends in ".docx"
#               (the content is not examined in that case)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6" / "word7" / "word8"
#             - some line contains "Word.Document.6", ".7" or ".8"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open so that odd characters in the filename cannot be
    # taken for an open mode; a failure is reported rather than read from.
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Error: unable to open $input_filename for reading: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $realtype;
}
527
528
529# Specific type-to-type conversions
530#
531# Each of the following functions attempts to convert a document from
532# a specific format to another.  If they succeed they return 1 and leave
533# the output document(s) in the appropriate place; if they fail they
534# return 0 and delete any working files.
535
536
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl on the input file, handing it the output filestem, the fail
# log file, the verbosity and the timeout.
#
# Returns the exit code of wvware.pl, which the callers use as a true/false
# success flag.  Returns 0 when the script could not be launched at all or
# was terminated by a signal, so that neither case is taken for success.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

#    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # -1 means the command could not be started; a non-zero low byte means
    # it died on a signal.  Dividing either by 256 would give a non-zero
    # (true) fraction, so deal with them explicitly.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    # the exit code proper lives in the high byte
    return $raw_status >> 8;
}
551
# Attempt to convert a word document to html with the word2html scripting program
#
# On windows a .docx input is handed to docx2html.vbs (run through CScript);
# anything else goes to word2html.
#
# Returns 1 if "$output_filestem.html" already exists, or if the conversion
# produced a non-empty file whose first line mentions "html".  Returns 0
# otherwise; converter error output is echoed to STDERR and/or appended to
# the fail log (when one is configured).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .=  "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $errfile;
            if (open($errfile, '<', "$output_filestem.err")) {

                # $faillog stays undefined unless a fail log is configured
                # and could be opened for appending
                my $faillog;
                if ($faillogfile ne "") {
                    open($faillog, '>>', $faillogfile) or undef $faillog;
                }

                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($faillog);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($faillog);
                } # while errfile

                close($faillog) if ($faillog);
                # the error file handle used to be left open here
                close($errfile);
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        my $tmp;
        if (open($tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            # copy the converter's error output into the fail log
            my $errlog;
            if (open($errlog, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog>) {print {$faillog} $errline;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
652
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml and then checks that "$output_filestem.html" has some
# word content after its <body> line.  If rtftohtml also left a
# "${output_filestem}_ToC.html" table of contents, it is merged into the
# html (with its links made relative) and removed.
#
# Returns 1 on success.  Returns 0 on failure, after appending the
# converter's error output to the fail log (when one is configured) and
# removing the .err and .html files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an html file
        # that cannot be opened counts as an unsuccessful conversion
        my $html_in;
        if (open($html_in, '<', "$output_filestem.html")) {
            my $past_header=0;
            while (my $line = <$html_in>) {

                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html","$output_filestem.src");

            # NB: "open FH, $name || ..." would attach the || to the
            # filename, so failures went unnoticed.  Test open's own result.
            my ($htmlsrc, $toc, $html_out);
            my $src_ok = open($htmlsrc, '<', "$output_filestem.src");
            my $toc_ok = open($toc, '<', "${output_filestem}_ToC.html");
            # only create the output file once both inputs are readable
            my $out_ok = ($src_ok && $toc_ok)
                ? open($html_out, '>', "$output_filestem.html") : 0;

            if (!($src_ok && $toc_ok && $out_ok)) {
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                # fall back on the html without a table of contents
                &FileUtils::moveFiles("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped)
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            scalar(<$toc>); scalar(<$toc>); # ignore first 2 lines
            my $third_line = <$toc>;        # line 3 = "<ol>\n"
            print {$html_out} $third_line if (defined $third_line);
            while (my $toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $src_line = <$htmlsrc>) {
                print {$html_out} $src_line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            my $errlog;
            if (open($errlog, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog>) {print {$faillog} $errline;}
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
769
770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the zoom factor and the -pdf_* switches given on
# the command line.  Returns 1 when the command succeeded and left a
# non-empty "$output_filestem.html"; the .err and .out files are then
# removed.  Otherwise returns 0 after echoing the converter's error output
# to STDERR, appending it to the fail log (when one is configured) and
# removing the .out, .html and .err files.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # formulate the command
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr go to separate files, except on windows versions
    # that are not NT-based, where only stdout is redirected (to .err)
    if ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
        $cmd .= " > \"$err_file\"";
    } else {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl" . ($! ? ": $!" : "") . "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<ERRLOG>) {
            print STDERR "$_";
        }
        close ERRLOG;
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
        {
            open (ERRLOG, $err_file);
            while (<ERRLOG>) {print FAILLOG $_;}
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
834
835
836# Convert a pdf file to html with the newer Xpdftools' pdftohtml
837# This generates "paged HTML" where extracted, selectable text is positioned
838# over screenshots of each page.
839# Since xpdf's pdftohtml fails if the output dir already exists and for easier
840# naming, the output files are created in a "pages" subdirectory of the tmp
841# location parent of $output_filestem instead
842sub xpdf_to_html {
843    my ($dirname, $input_filename, $output_filestem) = @_;
844
845    my $cmd = "";
846
847    # build up the path to the doc-to-html conversion tool we're going to use
848    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
849   
850    if($ENV{'GSDLOS'} =~ m/^windows$/i) { # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit
851    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
852    } else { # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var
853   
854    # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
855    # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
856    # specific subdirectories exist in a greenstone installation.
857    # None of those locations need exist when xpdf-tools is installed with GS.
858    # So don't depend on GSDLARCH as forcing that to be exported has side-effects
859    if($ENV{'BITNESS'}) {
860        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
861    } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
862        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
863    }
864    }
865   
866    # We'll create the file by name $output_filestem during post-conversion processing.
867    # Note that Xpdf tools will only create its conversion products in a dir that does
868    # not yet exist. So we'll create this location as a subdir of the output_filestem's
869    # parent directory. The parent dir is the already generated tmp area for conversion. So:
870    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
871    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
872    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
873    my ($tailname, $tmp_dirname, $suffix)
874    = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
875    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
876
877    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");
878    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
879    $cmd .= "\"$xpdf_pdftohtml\"";
880    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
881#    $cmd .= " -c" if ($pdf_complex);
882#    $cmd .= " -i" if ($pdf_ignore_images);
883#    $cmd .= " -a" if ($pdf_allow_images_only);
884#    $cmd .= " -hidden" unless ($pdf_nohidden);   
885    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";
886    #$cmd .= " \"$input_filename\" \"$output_filestem\"";
887
888    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
889    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
890    } else {
891    $cmd .= " > \"$output_filestem.err\"";
892    }
893
894    #print STDERR "@@@@ Running command: $cmd\n";
895
896    $!=0;
897    my $retval=system($cmd);
898    if ($retval!=0)
899    {
900    print STDERR "Error executing xpdf's pdftohtml tool";
901    if ($!) {print STDERR ": $!";}
902    print STDERR "\n";
903    }
904
905    # make sure the converter made something
906    if ($retval!=0 || ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
907    {
908    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
909    # print out the converter's std err, if any
910    if (-s "$output_filestem.err") {
911        open (ERRLOG, "$output_filestem.err") || die "$!";
912        print STDERR "pdftohtml error log:\n";
913        while (<ERRLOG>) {
914        print STDERR "$_";
915        }
916        close ERRLOG;
917    }
918    #print STDERR "***********output filestem $output_filestem.html\n";
919    &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
920    if (-e "$output_filestem.err") {
921        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
922        {
923        open (ERRLOG, "$output_filestem.err");
924        while (<ERRLOG>) {print FAILLOG $_;}
925        close ERRLOG;
926        close FAILLOG;
927        }   
928        &FileUtils::removeFiles("$output_filestem.err");
929    }
930    return 0;
931    }
932
933    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
934    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
935    return 1;
936}
937
938
939
# Convert a PDF or PostScript file to images by running the pdfpstoimg.pl
# helper script (which needs ImageMagick's convert utility).
#
# Parameters:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - path stem handed to pdfpstoimg.pl; "<stem>.out" and
#                      "<stem>.err" temporarily hold the helper's stdout/stderr
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed to pdfpstoimg.pl as -convert_to
#
# Also reads $verbosity, $timeout, $is_winnt_2000 and $faillogfile, which are
# defined outside this routine.
#
# Returns 1 if pdfpstoimg.pl exited with status 0. Otherwise the helper's
# stderr is echoed to STDERR, appended to $faillogfile (when set) and 0 is
# returned. The .out/.err scratch files are removed in both cases.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # The output is captured only to keep it off the console; we just
        # want the exit status left in $?.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    # pdfpstoimg.pl itself is given without a directory: "perl -S" looks the
    # script up on the PATH, so the PATH must be set up correctly.
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close $errlog;
        }

        if (-e $err_file) {
            # append the converter's std err to the fail log, if one is in use
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $err_line = <$errlog>)) {
                        print {$faillog} $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1023
# Convert a PDF file to text with the pdftotext command.
#
# Parameters:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file to convert
#   $output_filestem - path stem; the text is written to "<stem>.text", while
#                      "<stem>.out" / "<stem>.err" temporarily hold the
#                      command's stdout/stderr
#
# Also reads $faillogfile, which is defined outside this routine.
#
# Returns 1 if "<stem>.text" exists, is non-empty and contains at least one
# word character. Otherwise pdftotext's stderr is echoed to STDERR, appended
# to $faillogfile (when set), any partial output is removed and 0 is returned.
# The .out/.err scratch files are removed in both cases.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            # test definedness, not truth, so that a last line of "0" is
            # still examined
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extr_text;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            # append the converter's std err to the fail log, if one is in use
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $err_line = <$errlog>)) {
                        print {$faillog} $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # Success: clear away both scratch files. The .out file is created by the
    # shell redirection above on non-Windows systems, so it must go too.
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1089
# Convert a PostScript document to text.
#
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Strategy:
#   1. On non-Windows systems run gs with ps2ascii.ps, sending its output to
#      "<stem>.text" and its stderr to "<stem>.err".
#   2. If that is not possible or fails (Windows, non-zero exit status, no
#      output file, or output starting with "Error: "), log the problem and
#      fall back to a crude regular-expression based text extraction.
#   3. Re-wrap the resulting text so that lines break at the first whitespace
#      after 72 characters.
#
# Parameters:
#   $input_filename  - path of the PostScript file
#   $output_filestem - path stem; the text ends up in "<stem>.text"
#
# Also reads $timeout and $faillogfile, which are defined outside this routine.
#
# Returns 1 on success. Returns 0 if the fallback extraction cannot read the
# input, cannot write the output, or the input does not start with "%!".
# Dies if the files needed for the final re-wrapping step cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path, so stems containing spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system for this...
        # $? is -1 if the program could not be started, holds the exit code in
        # its high byte and the terminating signal in its low byte. Anything
        # other than 0 means gs did not finish cleanly; look at $! for a message.
        my $status = $?;

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            close $psout;
            if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (defined(my $err_line = <$errlog>)) {
                    print {$faillog} $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        unless (open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close $ps_in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change backslash-escaped parentheses to curly braces
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between parentheses
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        # Only create the output file once there is something to put in it,
        # so that a failed extraction leaves no empty .text file behind.
        my $ps_out;
        unless (open($ps_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$ps_out} $text;
        close $ps_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" (not "||") so that the die applies to the open, not to the file name
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$wrap_in>)) {
        while (length($line) > $wrap_length) {
            # chop off at least $wrap_length characters plus the rest of the
            # current word; stop if that ever fails, rather than loop forever
            last unless ($line =~ s/^(.{$wrap_length}[^\s]*)\s*//);
            print {$wrap_out} "$1\n";
        }
        # whatever is left is short enough (or could not be wrapped)
        print {$wrap_out} $line;
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1260
1261
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text() into "<stem>.text" and then
# wrapped in a minimal HTML page "<stem>.html": blank lines become <p>, every
# other line is prefixed with <br>, and the characters & < > are escaped.
#
# Parameters:
#   $input_filename  - path of the file to convert
#   $output_filestem - path stem for the intermediate .text and final .html
#
# Returns 1 on success. Returns 0 if any_to_text() fails, or if the text file
# cannot be read or the HTML file cannot be written. The intermediate .text
# file is removed once it has been produced.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    unless (open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    unless (open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # escape & first, so the entities produced below are not re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    # buffered write errors only show up when the handle is closed
    my $write_ok = close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);

    unless ($write_ok) {
        print STDERR "any_to_html: error writing $html_file: $!\n";
        &FileUtils::removeFiles($html_file) if (-e $html_file);
        return 0;
    }
    return 1;
}
1298
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# The input is read in binary mode, one newline-terminated chunk at a time.
# In each chunk, runs of characters outside the printable ASCII range
# (octal 040-176) become line breaks, pieces shorter than 10 characters are
# dropped, and whatever remains is written to "<stem>.text".
#
# Parameters:
#   $input_filename  - path of the file to extract text from
#   $output_filestem - path stem; output goes to "<stem>.text"
#
# Also reads $use_strings, which is defined outside this routine; when it is
# false nothing is done.
#
# Returns 1 if at least one chunk of text was written. Returns 0 if
# $use_strings is false, a file cannot be opened, writing fails, or nothing
# printable was found (in the last two cases the output file is removed).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);
    my $out_fh;
    unless (open($out_fh, '>', $text_file)) {
        close $in_fh; # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in_fh>)) {

        # turn every run of non-printable characters into a line break
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out_fh} $line;
            ++$output_line_count;
        }
    }

    # buffered write errors only show up when the handle is closed
    my $write_ok = close $out_fh;
    close $in_fh;

    if ($write_ok && $output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;

}
Note: See TracBrowser for help on using the browser.