root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 30681

Revision 30681, 36.6 KB (checked in by ak19, 4 years ago)

3 new strings introduced by Kathy contained the :, which is used as a separator in the properties file. Although Kathy tried to escape it with a backslash, it broke GTI because GTI doesn't recognise the backslash as a separator and all kinds of weird things happened from then on, so that the Gujarati translator kept having to translate the current date rather than a real GS3 interface string. Modified the gti.pl code (to be committed) and the new strings that Kathy introduced, so that hopefully, GTI can now handle it. Property names and values will be split at the right-most separator character now (= or :) and any on the left should not be escaped.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
46BEGIN {
47    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49}
50
51use strict;
52
53use parsargv;
54use util;
55use FileUtils;
56use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded; in that
# case the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to parsargv::parse
my $use_strings;            # -use_strings
my $pdf_complex;            # -pdf_complex
my $pdf_nohidden;           # -pdf_nohidden
my $pdf_zoom;               # -pdf_zoom
my $pdf_ignore_images;      # -pdf_ignore_images
my $pdf_allow_images_only;  # -pdf_allow_images_only
my $windows_scripting;      # -windows_scripting
69
# print_usage()
#
# Writes the command-line help text to STDERR and terminates the script
# with exit status 1.  Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from the help text
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # closing parenthesis was missing in the original message
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
91
# Script-wide settings, filled in by main() from the command line
my $faillogfile = "";   # -errlog: file that converter error output is appended to
my $timeout     = 0;    # -timeout: cpu-seconds limit prepended as "ulimit -t"
my $verbosity   = 0;    # -verbose: passed on to helper scripts

# main(@args)
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory of the
# input file and dispatches to the matching convertXXX function.  The value
# returned by that function is printed to STDOUT followed by a newline.
#
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable input file or an unsupported input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # Options arrive with their leading dash(es) still attached, so the
    # pattern has to allow for them (the original pattern could never match).
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
             "type/$type_re/", \$input_type,
             '/errlog/.*/', \$faillogfile,
             'output/(auto|html|text|pagedimg).*/', \$output_type,
             'timeout/\d+/0',\$timeout,
             'verbose/\d+/0', \$verbose,
             'windows_scripting',\$windows_scripting,
             'use_strings', \$use_strings,
             'pdf_complex', \$pdf_complex,
             'pdf_ignore_images', \$pdf_ignore_images,
             'pdf_allow_images_only', \$pdf_allow_images_only,
             'pdf_nohidden', \$pdf_nohidden,
             'pdf_zoom/\d+/2', \$pdf_zoom
             ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one argument (the input file) must be left after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    if (!defined $input_type || $input_type eq "") {
        # No -type given: fall back to the extension without its leading dot.
        # Leave the type undefined when there is no extension so that the
        # dedicated error message below is actually reachable.
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion to the caller on STDOUT
    print "$result\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
201
202&main(@ARGV);
203
204
205
206# Document-type conversion functions
207#
208# The following functions attempt to convert documents from their
209# input type to the specified output type.  If no output type was
210# given, then they first attempt HTML, and then TEXT.
211#
212# Each returns the output type ("html" or "text") or "fail" if no
213# conversion is possible.
214
# Convert a Microsoft word document
#
# Sniffs the real type of the file first (many .doc files are not in fact
# word documents) and hands over to the Word 6/7/8 converter, the RTF
# converter or the generic converter accordingly.  Returns whatever the
# chosen converter returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %handled_by_word_converter = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($handled_by_word_converter{$detected}) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 document
#
# When HTML output is acceptable (no output type given, or one containing
# "html"), tries the windows-scripting converter if -windows_scripting is
# set, otherwise the wvware based one.  Returns "html" when that succeeds;
# in every other case the result of convertAnything is returned.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));

    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);

        return "html" if ($converted);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file
#
# Only a specialised conversion to HTML is attempted (through windows
# scripting when -windows_scripting is set, otherwise through rtftohtml).
# Returns "html" on success and "fail" otherwise.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));

    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : rtf_to_html($input_filename, $output_filestem);

        return "html" if ($converted);
    }

    # RTF is so ugly that it is not worth running strings over, so there is
    # deliberately no fallback to convertAnything here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
280
281
# Convert an unidentified file
#
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type allows it (no output type allows
# both).  Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable && any_to_html($input_filename, $output_filestem)) {
        return "html";
    }

    # Convert to text
    my $text_acceptable = (!$output_type || ($output_type =~ m/text/i));
    if ($text_acceptable && any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document
#
# Depending on the requested output type, tries conversion to images,
# then to HTML, then to text.  Returns "item" (images), "html", "text"
# or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last "-" in the output type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $made_html = pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
343
344
# Convert an Adobe PostScript document
#
# Depending on the requested output type, tries conversion to images and
# then to text.  Returns "item" (images), "text" or "fail".

sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # keep only what follows the last "-" in the output type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = ps_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
369
370
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract VB script is run (returns "item").  Otherwise, when HTML is
# acceptable, ppttohtml.pl is run (returns "html").  If the chosen converter
# fails, or neither applies, any_to_text is tried (returns "text").
# Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pick the pptextract flag matching the requested image format
        my $ppt_convert_type =
              ($output_type =~ m/gif/i)  ? "-g"
            : ($output_type =~ m/jp?g/i) ? "-j"
            : ($output_type =~ m/png/i)  ? "-p"
            :                              "";

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript (formerly the pptextract.exe VB executable)
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        # if the converting directory already exists there is nothing to do
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
436
437
# Convert a Microsoft Excel document
#
# When HTML is acceptable, runs xlstohtml.pl and returns "html" if it exits
# with status 0.  Otherwise (or if that fails) falls back to any_to_text,
# returning "text" on success and "fail" if nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
471
472
473
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns:
#   "docx"    - -windows_scripting is set and the filename ends in .docx
#               (any case); the file contents are not inspected
#   "rtf"     - the first line of the file starts with "{\rtf"
#   "word6", "word7" or "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # case-insensitive, to agree with the docx test in native_doc_to_html
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    # An unreadable file simply cannot be identified
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $doctype = "unknown";
    my $first   = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);

    return $doctype;
}
514
515
516# Specific type-to-type conversions
517#
518# Each of the following functions attempts to convert a document from
519# a specific format to another.  If they succeed they return 1 and leave
520# the output document(s) in the appropriate place; if they fail they
521# return 0 and delete any working files.
522
523
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl through the perl executable reported by
# util::get_perl_exec, passing it the input file, the output stem, the
# fail-log filename, the verbosity and the timeout.  Returns the value of
# system() divided by 256.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $perl_exec  = &util::get_perl_exec();
    my $launch_cmd = join(" ",
                          "\"$perl_exec\"",
                          "-S",
                          "wvware.pl",
                          "\"$input_filename\"",
                          "\"$output_filestem\"",
                          "\"$faillogfile\"",
                          $verbosity,
                          $timeout);

    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    return system($launch_cmd) / 256;
}
538
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows, .docx input is converted with docx2html.vbs (launched through
# CScript); other input uses the word2html executable.  On other platforms
# word2html from $GSDLHOME/bin/$GSDLOS is used.
#
# Returns 1 if "$output_filestem.html" already exists, or if the converter
# exits with status 0 and the first line of the generated file contains
# "html".  Returns 0 otherwise; when the converter itself fails, the
# contents of "$output_filestem.err" are echoed to STDERR and appended to
# the fail log (if one was given).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR "   This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &FileUtils::filenameConcatenate($vbScript, "word2html") . "\"";
    }

    if (-e $html_file) {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file) {
            my $write_to_fail_log = 0;
            my $faillog_fh;
            if ($faillogfile ne "" && open($faillog_fh, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog_fh} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog_fh} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # the original never closed this handle
                close($err_fh);
            }
            else {
                print STDERR "Unable to read $err_file: $!\n";
            }

            close($faillog_fh) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $html_fh, '<', $html_file)) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) { print {$faillog_fh} $line; }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
639
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml and then checks that "$output_filestem.html" contains at
# least one word character after the <body> line once tags are stripped.
# If so, a generated "${output_filestem}_ToC.html" (when present) is merged
# into the html file with its links made relative, and 1 is returned.
# Otherwise the converter's error output is appended to the fail log (if one
# was given), the working files are removed and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header; an unreadable
        # file counts as "no content"
        if (open(my $html_in, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &FileUtils::moveFiles($html_file, $src_file);

            # Each open is checked on its own.  The original wrote
            # 'open H, "file" || ++$open_failed', where || binds to the
            # filename string, so a failed open was never noticed.
            my $src_ok = open(my $src_fh, '<', $src_file);
            my $toc_ok = open(my $toc_fh, '<', $toc_file);
            my $out_ok = open(my $out_fh, '>', $html_file);

            if (!($src_ok && $toc_ok && $out_ok)) {
                close($src_fh) if ($src_ok);
                close($toc_fh) if ($toc_ok);
                close($out_fh) if ($out_ok);
                # put the converted file back without a table of contents
                &FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but not
            # including) the first line without a word character
            while (defined(my $header_line = <$src_fh>)) {
                last if ($header_line !~ m/\w/);
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            <$toc_fh>; <$toc_fh>; # ignore first 2 lines
            my $list_open = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $list_open if (defined $list_open);
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (my $body_line = <$src_fh>) {
                print {$out_fh} $body_line;
            }
            close($src_fh);
            close($out_fh);

            &FileUtils::removeFiles($toc_file);
            &FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog_fh} " (rtf file might be too recent):\n";
            if (open(my $err_fh, '<', $err_file)) {
                while (my $err_line = <$err_fh>) { print {$faillog_fh} $err_line; }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &FileUtils::removeFiles($err_file);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
756
757
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the zoom factor and the -c / -i / -a / -hidden
# flags selected by the pdf_* command-line options.  Returns 1 when the
# command exits with status 0 and "$output_filestem.html" is non-empty.
# Otherwise the error output is echoed to STDERR, appended to the fail log
# (if one was given), the working files are removed and 0 is returned.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $full_perl_path = &util::get_perl_exec();

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    my $separate_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $separate_stderr
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # success: the converter ran cleanly and made something
    if ($retval == 0 && -s "$output_filestem.html") {
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # failure: clean up and report
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $err_fh, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <$err_fh>) {
            print STDERR "$err_line";
        }
        close($err_fh);
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, ">>$faillogfile")) {
            open(my $err_fh, "$output_filestem.err");
            while (my $err_line = <$err_fh>) { print {$faillog_fh} $err_line; }
            close($err_fh);
            close($faillog_fh);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    return 0;
}
821
# Convert a pdf file to various types of image with the convert command
#
# First probes for ImageMagick by running "gs-magick.pl identify" (skipped
# on Windows when Win32::IsWinNT() is false) and returns 0 if the probe
# reports "program not found".  Then runs pdfpstoimg.pl with the part of
# $output_type after the last underscore as the target format.  Returns 1
# when that command exits with status 0; otherwise the error output is
# echoed to STDERR, appended to the fail log (if one was given) and 0 is
# returned.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if ($ENV{'GSDLOS'} ne "windows" || Win32::IsWinNT()) {
        my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        my $identify_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found":
        # -1 on Linux and 256 on Windows, which become -1 and 1 once shifted
        # by 8 and converted from an unsigned to a signed value.  gs-magick.pl
        # already does this conversion on its own $?, but by the time we get
        # here we need to do it again.
        my $status = $? >> 8;
        if ($status & 0x80) {
            $status = -(0x100 - ($status & 0xFF));
        }

        my $not_found_status = ($ENV{'GSDLOS'} eq "windows") ? 1 : -1;
        if ($status == $not_found_status) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";

    # keep only what follows the last "_" in the output type (e.g. the image format)
    $output_type =~ s/.*\_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";

    my $separate_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $separate_stderr
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);

    # success: nothing more to do than tidy up
    if ($retval == 0) {
        &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    print STDERR "Error executing pdfpstoimg.pl";
    print STDERR ": $!" if ($!);
    print STDERR "\n";

    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $err_fh, "$output_filestem.err") || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $err_line = <$err_fh>) {
            print STDERR "$err_line";
        }
        close($err_fh);
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, ">>$faillogfile")) {
            open(my $err_fh, "$output_filestem.err");
            while (my $err_line = <$err_fh>) { print {$faillog_fh} $err_line; }
            close($err_fh);
            close($faillog_fh);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    return 0;
}
905
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - the PDF file to convert
#   $output_filestem - output path without extension; the extracted text is
#                      written to "$output_filestem.text"
#
# Returns 1 if a non-empty text file containing at least one word character
# was produced, 0 otherwise.  On failure the converter's error output is
# echoed to STDERR and appended to $faillogfile (when that is non-empty).
# The temporary "$output_filestem.out" and "$output_filestem.err" files are
# removed on both the success and the failure path.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # on windows only stdout is redirected, and it goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    if (system($cmd) != 0) {
        # $! only carries information when the command could not be started
        # at all ($? == -1); otherwise decode the wait status in $?
        my $reason;
        if ($? == -1) {
            $reason = "$!";
        } elsif ($? & 127) {
            $reason = "killed by signal " . ($? & 127);
        } else {
            $reason = "exit status " . ($? >> 8);
        }
        print STDERR "Error executing $cmd: $reason\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close($errlog);
        }

        # the shell redirect creates the .out file even when it stays empty
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if (-e $err_file) {
            # append the converter's error output to the fail log, if one is in use
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print {$faillog} $err_line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # success: tidy up both redirect targets, not just the .err file
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
971
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Strategy: run gs (skipped on windows); if that fails, or gs reports an
# interpreter error on the first line of its output, fall back to a crude
# regexp-based extraction of the strings in the PostScript source.  The
# resulting text is then re-wrapped at the first whitespace after 72
# characters.
#
# Returns 1 on success.  Returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with '%!'.
# Dies if the temporary files used for re-wrapping cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the redirect target so that paths containing spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Test the whole wait status, not just the exit code ($? >> 8), so
        # that a process killed by a signal also counts as a failure.
        # If the program couldn't be started, look at $! for the message.
        my $wait_status = $?;

        if ($wait_status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($psout);
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file...
        # (checked before the output file is created, so that a bad header
        # does not leave an empty .text file or an open handle behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out} "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);

    # "or" rather than "||": with "||" the die attaches to the filename
    # string (always true), so a failed open would go unnoticed
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # only loop again if the substitution really consumed something
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1141
1142
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text() into
# "$output_filestem.text", which is then wrapped in a minimal HTML page
# (blank lines become <p>, other lines are prefixed with <br>) and removed.
#
# Returns 1 on success, 0 if any_to_text() fails or if the intermediate
# text file / the HTML file cannot be opened or completely written.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close($text_fh);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # '&' must be escaped first, otherwise the entities produced for
        # '<' and '>' below would themselves be re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    # buffered write errors only surface when the handle is closed
    my $html_written = close($html_fh);
    close($text_fh);

    &FileUtils::removeFiles($text_file) if (-e $text_file);

    if (!$html_written) {
        print STDERR "any_to_html: error writing $html_file: $!\n";
        return 0;
    }
    return 1;
}
1179
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments:
#   $input_filename  - the file to extract strings from (read in binary mode)
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Does nothing and returns 0 unless $use_strings is set.
# Returns 1 if at least one chunk of text was written, otherwise removes
# the output file and returns 0.  Also returns 0 if the input or output
# file cannot be opened.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
      return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    # 3-arg open: the document's name is taken literally, so leading or
    # trailing whitespace and characters such as '&' in it are not
    # interpreted as part of the open mode
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', $text_file)) {
        close($in); # don't leave the input handle open on failure
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;

}
Note: See TracBrowser for help on using the browser.