root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 32207

Revision 32207, 41.8 KB (checked in by ak19, 16 months ago)

Got a basic Windows version of PDFPlugin's new paged_html mode working

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Greenstone's own Perl modules live in $GSDLHOME/perllib.  Put that directory
# at the front of the module search path at compile time, so the "use" lines
# that follow can find them.  Without GSDLHOME there is nothing we can do.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $gs_perllib = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $gs_perllib);
}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which case
# the flag falls back to 0.  The flag decides whether STDERR redirection
# ("2>") may be appended to the command lines built further down.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 if (!defined($is_winnt_2000));

# Command-line switches; main() fills these in through parsargv::parse().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
69
# Print the command-line synopsis to STDERR, then terminate the script with
# exit status 1.  main() calls this when the arguments cannot be parsed or
# when anything other than exactly one input filename is left over.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by the option parser in main() but used to be
        # missing from this synopsis
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
91
# Settings shared by the conversion routines below.  main() overwrites them
# from the -errlog, -timeout and -verbose options.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Entry point: parse the command line, work out the input type (from -type
# or, failing that, from the filename extension), change into the input
# file's directory, run the matching convertXXX routine and print the output
# type it reports, followed by a newline, on STDOUT.
#
# Arguments: the command-line argument list.
# Exits with status 1 (via print_usage() or directly) on bad arguments, an
# unreadable input file, or an unsupported input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The raw arguments still carry their leading dash(es) at this point, so
    # the dashes must be allowed for when looking for the switch.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one argument (the input file) must be left after the options
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back on the extension (without its dot).  When
    # there is no extension either, leave the type undefined so the dedicated
    # error message below is produced.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) or die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the produced output type on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) or die "Unable to return to directory $stored_dir";
}
201
# Script entry point: hand the command-line arguments over to main().
main(@ARGV);
203
204
205
206# Document-type conversion functions
207#
208# The following functions attempt to convert documents from their
209# input type to the specified output type.  If no output type was
210# given, then they first attempt HTML, and then TEXT.
211#
212# Each returns the output type ("html" or "text") or "fail" if no
213# conversion is possible.
214
# Convert a file that claims to be a Microsoft Word document.
#
# The content is sniffed first, because many .doc files are really RTF or
# something else entirely; the file is then handed to the converter that
# matches what was found.  Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word_flavour = map { $_ => 1 } qw(word6 word7 word8 docx);
    if ($is_word_flavour{$detected}) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    # Not something we recognise: fall back on the generic converter
    return convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# HTML is attempted first when the caller asked for it or expressed no
# preference, using the Windows scripting route if -windows_scripting was
# given and the wv-based route otherwise.  Anything else, or a failed HTML
# conversion, is delegated to convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = !$output_type || ($output_type =~ m/html/i);
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file.
#
# Only HTML is ever attempted.  RTF is so ugly that it is not worth running
# the generic strings-style fallback over it, so any other requested output
# type, or a failed HTML conversion, yields "fail".
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = !$output_type || ($output_type =~ m/html/i);
    return "fail" unless $html_wanted;

    # Attempt specialised conversion to HTML
    my $converted = $windows_scripting
        ? native_doc_to_html($input_filename, $output_filestem)
        : rtf_to_html($input_filename, $output_filestem);

    return $converted ? "html" : "fail";
}
280
281
# Convert an unidentified file.
#
# Tries the generic HTML extractor and then the generic text extractor,
# each only if the requested output type allows it (no requested type allows
# both).  Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || $output_type =~ m/html/i) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($no_preference || $output_type =~ m/text/i) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output file stem,
# and the requested output type (may be undefined or empty for "no
# preference").  Any prefix up to the last "-" is stripped from the output
# type before it is examined.
#
# Returns "item" (page images), "html", "paged_html" or "text" according to
# the conversion that succeeded, or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # No -output given: use the empty string so the pattern matches below do
    # not operate on an undefined value (this script runs under -w).
    $output_type = "" unless defined $output_type;

    my $success = 0;
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions.
    # The pattern is anchored so that "paged_html" does not end up here.
    if (!$output_type || ($output_type =~ m/^html/i)) {
        $success = pdf_to_html($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools.  This will be the new default for PDFs when output_type for
    # PDF docs is not specified (once our use of xpdftools' pdftohtml has been
    # implemented on win and mac).
    if ($output_type =~ m/paged_html/i) {
        $success = xpdf_to_html($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "paged_html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = pdf_to_text($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";
}
356
357
# Convert an Adobe PostScript document.
#
# Arguments: directory of the input file, input filename, output file stem,
# and the requested output type (may be undefined or empty for "no
# preference").  Any prefix up to the last "-" is stripped from the output
# type before it is examined.
#
# Returns "item" (page images) or "text" according to the conversion that
# succeeded, or "fail".  There is no HTML route for PostScript.
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # No -output given: use the empty string so the pattern matches below do
    # not operate on an undefined value (this script runs under -w).
    $output_type = "" unless defined $output_type;

    my $success = 0;
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting, and an output type that is neither html nor text,
# the pptextract script is run and "item" is returned on success (or straight
# away if the output directory already exists).  Otherwise, if HTML is wanted
# or no preference was given, ppttohtml.pl is tried and "html" returned on
# success.  When neither route succeeds the generic text extractor is used as
# a last resort.  Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # No -output given: avoid pattern matching on an undefined value
    $output_type = "" unless defined $output_type;

    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) ? 1 : 0;
    # STDERR redirection is only appended where the shell is known to cope
    my $can_redirect_stderr = (!$on_windows || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Image format flag understood by pptextract
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($on_windows) {
            # now we use the .vbs VBScript, launched through CScript
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        } else {
            # quote the path in case it contains spaces
            $vbScript = "\"$vbScript\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = "";
        # ulimit is a unix shell builtin; prefixing it on Windows would only
        # make the whole command line fail
        if ($timeout && !$on_windows) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $can_redirect_stderr;

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if $can_redirect_stderr;

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: generic text extraction
    my $success = any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
449
450
# Convert a Microsoft Excel document.
#
# When HTML is wanted (or no preference was given) xlstohtml.pl is run and
# "html" returned if it exits cleanly.  In every other case the generic text
# extractor is tried.  Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $perl_exec = &util::get_perl_exec();
        my $redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # formulate the command
        my @pieces = (
            "\"$perl_exec\" -S xlstohtml.pl ",
            " \"$input_filename\" \"$output_filestem.html\"",
        );
        push(@pieces, " 2>\"$output_filestem.err\"") if $redirect_stderr;
        my $command_line = join("", @pieces);

        # execute the command
        $! = 0;
        my $status = system($command_line);
        return "html" if $status == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back on generic text extraction
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
484
485
486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns "docx" (only when -windows_scripting is on and the name ends in
# .docx, in any letter case), "rtf" when the first line starts with "{\rtf",
# "word6"/"word7"/"word8" when a "Word.Document.N" marker is found anywhere
# in the file, and "unknown" otherwise, including when the file cannot be
# opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    # Case-insensitive, to agree with the docx test in native_doc_to_html
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $doctype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # Single exit point, so the handle is closed on every path
    close($chk);
    return $doctype;
}
527
528
529# Specific type-to-type conversions
530#
531# Each of the following functions attempts to convert a document from
532# a specific format to another.  If they succeed they return 1 and leave
533# the output document(s) in the appropriate place; if they fail they
534# return 0 and delete any working files.
535
536
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl through the perl executable and returns that script's exit
# code, which callers treat as a success flag (non-zero = converted).
# Returns 0 when the command could not be launched at all or was terminated
# by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    my $wait_status = system($launch_cmd);

    # system() returns -1 when the command cannot be started, and a status
    # whose low 7 bits are set when the child died from a signal.  Dividing
    # those by 256 would give a non-zero (true) fraction, i.e. a bogus
    # "success", so report them as failure explicitly.
    if ($wait_status == -1 || ($wait_status & 127)) {
        print STDERR "Error executing wvware.pl";
        print STDERR ": $!" if ($wait_status == -1);
        print STDERR "\n";
        return 0;
    }

    # Normal termination: the exit code lives in the high byte
    return $wait_status >> 8;
}
551
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows, docx input is converted with docx2html.vbs (run through
# CScript); everything else uses word2html.  Returns 1 when
# "$output_filestem.html" already exists or was produced and its first line
# mentions "html"; returns 0 otherwise, appending the converter's error
# output to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) ? 1 : 0;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($on_windows) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    # ulimit is a unix shell builtin; on Windows it would break the command
    if ($timeout && !$on_windows) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if (!$on_windows || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # Optional fail log; $faillog stays undefined when it is not
            # configured or cannot be opened
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if $faillog;
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n" if $faillog;
                }
                close($errfile);
            }
            close($faillog) if $faillog;
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
652
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml, then checks that "$output_filestem.html" has word content
# after its <body> line.  If so, a generated "${output_filestem}_ToC.html"
# table of contents (when present) is spliced in after the header part of
# the HTML and 1 is returned.  Otherwise the error output is appended to the
# fail log (when configured), the working files are removed and 0 is
# returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) ? 1 : 0;

    # formulate the command
    my $cmd = "";
    # ulimit is a unix shell builtin; on Windows it would break the command
    if ($timeout && !$on_windows) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if (!$on_windows || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }
            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) {  # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # "or", not "||": the failure has to be attributed to open()
            # itself, not to the (always true) filename string.  The output
            # file is only created once both inputs are known to be readable.
            my $src_ok = open(my $htmlsrc, '<', "$output_filestem.src");
            my $toc_ok = open(my $toc, '<', "${output_filestem}_ToC.html");
            my $out_ok = ($src_ok && $toc_ok)
                ? open(my $html_out_probe, '>', "$output_filestem.html")
                : 0;
            close($html_out_probe) if $out_ok;

            if (!($src_ok && $toc_ok && $out_ok)) {
                close($htmlsrc) if $src_ok;
                close($toc) if $toc_ok;
                # put the converter's own output back and settle for that
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            open(my $html_out, '>', "$output_filestem.html") or do {
                close($htmlsrc);
                close($toc);
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            };

            # print out header info from src html: everything up to the
            # first line without word characters (that line is dropped)
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>;      # ignore first 2 lines
            $ignored = <$toc>;
            my $list_start = <$toc>;   # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
769
770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the zoom/complex/image/hidden-text options taken
# from the command-line switches.  Returns 1 when the command exits cleanly
# and "$output_filestem.html" is non-empty.  Otherwise it echoes the
# converter's error output to STDERR, appends it to the fail log (when
# configured), removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) ? 1 : 0;

    my $cmd = "";
    # ulimit is a unix shell builtin; on Windows it would break the command
    if ($timeout && !$on_windows) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if (!$on_windows || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no separate STDERR redirection here; STDOUT goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }

        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
834
835
836# Convert a pdf file to html with the newer Xpdftools' pdftohtml
837# This generates "paged HTML" where extracted, selectable text is positioned
838# over screenshots of each page.
839# Since xpdf's pdftohtml fails if the output dir already exists and for easier
840# naming, the output files are created in a "pages" subdirectory of the tmp
841# location parent of $output_filestem instead
842sub xpdf_to_html {
843    my ($dirname, $input_filename, $output_filestem) = @_;
844
845    my $cmd = "";
846
847    # build up the path to the doc-to-html conversion tool we're going to use
848    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");
849
850    if ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
851    # TODO
852    } else { # unix or windows, use the appropriate bin folder for the bitness of the system
853        # In fact, when testing 3 different PDF docs, it doesn't seem to make a difference on
854        # 64 bit Windows whether the pdftohtml binary in the bin32 or bin64 folder is used.
855        # However, maybe we'll use another xpdf-tool too in future where bitness will be relevant.
856   
857    # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
858    # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
859    # specific subdirectories exist in a greenstone installation.
860    # None of those locations need exist when xpdf-tools is installed with GS.
861    # So don't depend on GSDLARCH as forcing that to be exported has side-effects
862    if($ENV{'BITNESS'}) {
863        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
864    } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
865        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
866    }
867    }
868
869    #print STDERR "@@@@ BITNESS: " . $ENV{'BITNESS'} . "\n";
870   
871    # We'll create the file by name $output_filestem during post-conversion processing.
872    # Note that Xpdf tools will only create its conversion products in a dir that does
873    # not yet exist. So we'll create this location as a subdir of the output_filestem's
874    # parent directory. The parent dir is the already generated tmp area for conversion. So:
875    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
876    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
877    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
878    my ($tailname, $tmp_dirname, $suffix)
879    = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
880    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");
881
882    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");
883    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
884    $cmd .= "\"$xpdf_pdftohtml\"";
885    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
886#    $cmd .= " -c" if ($pdf_complex);
887#    $cmd .= " -i" if ($pdf_ignore_images);
888#    $cmd .= " -a" if ($pdf_allow_images_only);
889#    $cmd .= " -hidden" unless ($pdf_nohidden);   
890    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";
891    #$cmd .= " \"$input_filename\" \"$output_filestem\"";
892
893    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
894    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
895    } else {
896    $cmd .= " > \"$output_filestem.err\"";
897    }
898
899    #print STDERR "@@@@ Running command: $cmd\n";
900
901    $!=0;
902    my $retval=system($cmd);
903    if ($retval!=0)
904    {
905    print STDERR "Error executing xpdf's pdftohtml tool";
906    if ($!) {print STDERR ": $!";}
907    print STDERR "\n";
908    }
909
910    # make sure the converter made something
911    if ($retval!=0 || ! -s &FileUtils::filenameConcatenate($tmp_dirname,"index.html"))
912    {
913    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
914    # print out the converter's std err, if any
915    if (-s "$output_filestem.err") {
916        open (ERRLOG, "$output_filestem.err") || die "$!";
917        print STDERR "pdftohtml error log:\n";
918        while (<ERRLOG>) {
919        print STDERR "$_";
920        }
921        close ERRLOG;
922    }
923    #print STDERR "***********output filestem $output_filestem.html\n";
924    &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");
925    if (-e "$output_filestem.err") {
926        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
927        {
928        open (ERRLOG, "$output_filestem.err");
929        while (<ERRLOG>) {print FAILLOG $_;}
930        close ERRLOG;
931        close FAILLOG;
932        }   
933        &FileUtils::removeFiles("$output_filestem.err");
934    }
935    return 0;
936    }
937
938    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
939    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
940    return 1;
941}
942
943
944
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF/PS document to convert
#   $output_filestem - output path stem; "<stem>.out" and "<stem>.err" are
#                      used as scratch files for the converter's output
#   $output_type     - requested type; everything up to and including the
#                      last underscore is stripped before it is passed on
#                      as the -convert_to value
#
# Returns 1 when pdfpstoimg.pl exits with status 0, otherwise 0. On failure
# the converter's stderr is echoed to STDERR and appended to $faillogfile
# (when one is configured).

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $on_windows = ($ENV{'GSDLOS'} eq "windows");

    # Probe for ImageMagick through gs-magick.pl. The probe is skipped only
    # on Windows 95/98 (i.e. Windows that is not NT-based).
    if (!$on_windows || Win32::IsWinNT()) {
        my $probe_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl";
        if (defined $verbosity) {
            $probe_cmd .= " --verbosity=$verbosity";
        }
        my $probe_output = `$probe_cmd identify 2>&1`;

        # Take the exit code out of $? and reinterpret it as a signed byte.
        # "Program not found" then shows up as -1 on Linux and as 1 on
        # Windows (256 before the shift). gs-magick.pl does the same
        # conversion before it exits, but it has to be redone here.
        my $status = $? >> 8;
        if ($status & 0x80) {
            $status = -(0x100 - ($status & 0xFF));
        }

        my $not_found_status = $on_windows ? 1 : -1;
        if ($status == $not_found_status) {
            # ImageMagick is not installed, so its convert utility is missing
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    # Assemble the conversion command, optionally capped by a CPU time limit
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $output_type =~ s/.*\_(.*)/$1/i;
    my $perl_exec = &util::get_perl_exec();
    $cmd .= "\"$perl_exec\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";

    # Everywhere except pre-NT Windows, stdout and stderr go to separate
    # files; otherwise stdout alone is captured in the .err file.
    my $separate_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $separate_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    # pdfpstoimg.pl is located via "perl -S", so the PATH is assumed to be
    # set up correctly.
    $! = 0;
    my $retval = system($cmd);

    # Success: tidy up the scratch files and report it
    if ($retval == 0) {
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # Failure: report it, relay the converter's stderr, then clean up
    my $failure_msg = "Error executing pdfpstoimg.pl";
    $failure_msg .= ": $!" if ($!);
    print STDERR "$failure_msg\n";

    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

    # echo the converter's std err, if any
    if (-s "$output_filestem.err") {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        print STDERR "$_" while <ERRLOG>;
        close ERRLOG;
    }

    if (-e "$output_filestem.err") {
        # append the converter's std err to the failure log, if configured
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            print FAILLOG $_ while <ERRLOG>;
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
1028
# Convert a PDF file to text with the pdftotext command
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF document to convert
#   $output_filestem - output path stem; the text is written to
#                      "<stem>.text", with "<stem>.out" / "<stem>.err" used
#                      as scratch files for the converter's output
#
# Returns 1 if a non-empty "<stem>.text" containing at least one word
# character was produced, otherwise 0. On failure the converter's stderr is
# echoed to STDERR and appended to $faillogfile (when one is configured).

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text=0;
        if (open (EXTR_TEXT, "$output_filestem.text")) {
            binmode(EXTR_TEXT); # just in case...
            # Test definedness explicitly: a final line consisting of "0"
            # with no newline is false in boolean context and would
            # otherwise end the scan before it is examined.
            while (defined(my $line = <EXTR_TEXT>)) {
                if ($line =~ m/\w/) {
                    $seen_text=1;
                    last;
                }
            }
            close EXTR_TEXT;
        } else {
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open (ERRLOG, "$output_filestem.err") || die "$!";
            print STDERR "pdftotext error log:\n";
            while (<ERRLOG>) {
                print STDERR "$_";
            }
            close ERRLOG;
        }
        # the stdout redirect above creates a .out file on non-windows
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
            {
                # only copy the error log if it can actually be read
                if (open (ERRLOG,"$output_filestem.err")) {
                    while (<ERRLOG>) {print FAILLOG $_;}
                    close ERRLOG;
                }
                close FAILLOG;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }
    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    # The .out scratch file is created by the shell redirect even when
    # pdftotext prints nothing, so remove it on success too.
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1094
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript document to convert
#   $output_filestem - output path stem; the text is written to
#                      "<stem>.text", with "<stem>.err" and
#                      "<stem>.text.tmp" used as scratch files
#
# On Windows, or whenever gs fails, the text is pulled out of the
# PostScript source with a series of regular expressions instead. In both
# cases the result is re-wrapped so that lines break at the first
# whitespace after 72 characters.
#
# Returns 1 on success. Returns 0 if the regex fallback cannot open its
# files or the input does not start with "%!". Dies if the wrap step cannot
# open its files.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the redirect target so that paths containing spaces work
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        system($cmd);
        # Look at the whole wait status, not just ($? >> 8): the low bits
        # are non-zero when the child was killed by a signal, and the
        # shifted value alone would report that as success.
        my $wait_status = $?;

        if ($wait_status != 0) {
            # if the program couldn't be started, $! holds the reason
            if ($!) {$error="$!";} else {$error="couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error="did not create output file.\n";
        }
        elsif (!open(PSOUT, "$output_filestem.text")) {
            $error="could not read output file: $!";
        }
        else
        {   # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <PSOUT>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error="interpreter error - \"$1\"";
            }
            close PSOUT;
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
        {
            print FAILLOG "gs - $error\n";
            if (-e "$output_filestem.err" && open(ERRLOG, "$output_filestem.err")) {
                while (<ERRLOG>) {print FAILLOG $_;}
                close ERRLOG;
            }
            close FAILLOG;
        }
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in_ok = open (IN, "$input_filename");
        warn "Couldn't read file: $!" unless $in_ok;
        my $out_ok = open (OUT, ">$output_filestem.text");
        warn "Couldn't write file: $!" unless $out_ok;
        if (!$in_ok || !$out_ok) {
            # don't leave whichever handle did open dangling
            close IN if $in_ok;
            close OUT if $out_ok;
            print STDERR "errors\n";
            return 0;
        }

        my $text="";  # this is for whole .ps file...
        $text = join('', <IN>); # see man perlport, under "System Resources"
        close IN;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
            {
                print FAILLOG "Bad postscript header: not '%!'\n";
                close FAILLOG;
            }
            close OUT;
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        print OUT "$text";
        close OUT;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &FileUtils::moveFiles("$output_filestem.text", "$output_filestem.text.tmp");
    # Use "or" with parenthesised arguments: with "open FH, EXPR || die" the
    # "||" binds to the filename string, so the die could never fire.
    open(INFILE, "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(OUTFILE, ">$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    my $line="";
    while (defined($line=<INFILE>)) {
        while (length($line)>0) {
            # Only print $1 when the substitution really matched, so a
            # non-matching line can neither loop forever nor emit a stale
            # capture.
            if (length($line)>$wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print OUTFILE "$1\n";
            } else {
                print OUTFILE "$line";
                $line="";
            }
        }
    }
    close INFILE;
    close OUTFILE;
    &FileUtils::removeFiles("$output_filestem.text.tmp");

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1265
1266
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path stem; the result is written to
#                      "<stem>.html" ("<stem>.text" is an intermediate file
#                      produced by any_to_text and removed afterwards)
#
# Returns 1 once the HTML file has been written. Returns 0 if any_to_text
# fails or if either the intermediate text file or the HTML file cannot be
# opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file; report failure instead of
    # printing to unopened handles and claiming success
    if (!open(TEXT, "<$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }
    if (!open(HTML, ">$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close TEXT;
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print HTML "<html><head>\n";
    print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print HTML "</head><body>\n\n";

    while (defined(my $line = <TEXT>)) {
        # Escape "&" first, so that entity-like sequences in the extracted
        # text are shown literally and the "&" of the entities added below
        # is not escaped a second time.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            # a blank line starts a new paragraph
            print HTML "<p>";
        } else {
            print HTML "<br> ", $line;
        }
    }
    print HTML "\n</body></html>\n";

    close HTML;
    close TEXT;

    &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1303
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to extract strings from
#   $output_filestem - output path stem; the text is written to
#                      "<stem>.text"
#
# Returns 1 if at least one chunk of text was written. Returns 0 if
# $use_strings is not set, if either file cannot be opened, or if nothing
# printable was found (in which case "<stem>.text" is removed again).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
      return 0;
    }

    print STDERR "\n**** In any to text****\n\n";
    # Three-argument open: the filename is taken literally, so names with
    # leading/trailing whitespace or a leading "&" are not misinterpreted.
    open(IN, "<", $input_filename) or return 0;
    binmode(IN);
    if (!open(OUT, ">", "$output_filestem.text")) {
        close IN;   # don't leak the input handle on failure
        return 0;
    }

    my $output_line_count = 0;
    # a lexical loop variable avoids clobbering the caller's $_
    while (defined(my $line = <IN>)) {

        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print OUT $line;
            ++$output_line_count;
        }
    }

    close OUT;
    close IN;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the browser.