root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 22429

Revision 22429, 48.3 KB (checked in by davidb, 10 years ago)

Added support for OpenOffice scripting through JODConverter.jar. Also added 'use strict' and then fixed up a variety of places that needed 'my' added.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# The 'use' lines that follow are resolved at compile time, so
# $GSDLHOME/perllib has to be put at the front of @INC inside a BEGIN block.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56use File::Basename;
57
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which we
# treat as "no".
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() fills these in through parsargv::parse
my ($use_strings, $windows_scripting, $openoffice_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
70
# Print the usage message on STDERR and exit with status 1.
sub print_usage
{
    # docx and odf are only offered when -openoffice_scripting is in effect
    my $type_option = $openoffice_scripting
        ? "  options:\n\t-type\tdoc|dot|docx|odf|pdf|ps|ppt|rtf|xls\t(input file type)\n"
        : "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";

    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        $type_option,
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script (if available) when converting Microsoft Word and PPT via VB script\n",
        "\t-openoffice_scripting\tuse openoffice script (if available) when converting Microsoft Word and PPT via OpenOffice\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n";

    exit(1);
}
98
# -errlog file that error output is appended to ("" means none), and the
# -timeout value in cpu seconds (0 means no limit is applied)
my ($faillogfile, $timeout) = ("", 0);
101
# Parse the command line, work out the input type (from -type or, failing
# that, from the filename extension), run the matching converter from the
# input file's directory, and print the converter's result followed by a
# newline on STDOUT.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # scan for -openoffice_scripting first, as it affects the permissible
    # values for -type
    if (grep { m/^-openoffice_scripting$/ } @args) {
        $openoffice_scripting = 1;
    }

    my $parse_type = $openoffice_scripting
        ? 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/'
        : 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';

    # read command-line arguments
    my @option_spec = (
        $parse_type, \$input_type,
        '/errlog/.*/', \$faillogfile,
        'output/(auto|html|text|pagedimage).*/', \$output_type,
        'timeout/\d+/0', \$timeout,
        'verbose/\d+/0', \$verbose,
        'windows_scripting', \$windows_scripting,
        'openoffice_scripting', \$openoffice_scripting,
        'use_strings', \$use_strings,
        'pdf_complex', \$pdf_complex,
        'pdf_ignore_images', \$pdf_ignore_images,
        'pdf_allow_images_only', \$pdf_allow_images_only,
        'pdf_nohidden', \$pdf_nohidden,
        'pdf_zoom/\d+/2', \$pdf_zoom,
    );
    print_usage() unless parsargv::parse(\@args, @option_spec);

    # Exactly one argument (the input file) must be left over, and it must
    # be readable
    print_usage() if (scalar(@args) != 1);

    my $input_filename = $args[0];
    unless (-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames: the output stem is the input path minus its extension
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back on the extension (without its leading dot)
    if ($input_type eq "") {
        $input_type = lc(substr($suffix, 1, length($suffix) - 1));
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Select convert utility
    my $word_converter = sub { &convertDOC($input_filename, $output_filestem, $output_type) };
    my %converter_for = (
        'doc' => $word_converter,
        'dot' => $word_converter,
        'rtf' => sub { &convertRTF($input_filename, $output_filestem, $output_type) },
        'pdf' => sub { &convertPDF($dirname, $input_filename, $output_filestem, $output_type) },
        'ps'  => sub { &convertPS($dirname, $input_filename, $output_filestem, $output_type) },
        'ppt' => sub { &convertPPT($input_filename, $output_filestem, $output_type) },
        'xls' => sub { &convertXLS($input_filename, $output_filestem, $output_type) },
    );
    if ($openoffice_scripting) {
        # these two types are only handled through OpenOffice
        $converter_for{'docx'} = $word_converter;
        $converter_for{'odf'}  = $word_converter;
    }

    my $convert = $converter_for{$input_type};
    if (!defined $convert) {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $convert->();
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
215
216
217
218# Document-type conversion functions
219#
220# The following functions attempt to convert documents from their
221# input type to the specified output type.  If no output type was
222# given, then they first attempt HTML, and then TEXT.
223#
# Each returns the output type ("html", "text" or, for image output, "item")
# or "fail" if no conversion is possible.
226
# Convert a Microsoft word document
#
# Returns the output type produced ("html" or "text") or "fail".

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # With OpenOffice scripting, .doc/.docx files go straight to Open Office
    # and the result is final either way
    if ($openoffice_scripting && ($input_filename =~ m/\.docx?$/i)) {
        my $converted = openoffice_doc_to_html($input_filename, $output_filestem);
        return $converted ? "html" : "fail";
    }

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    my %is_word678 = ("word6" => 1, "word7" => 1, "word8" => 1);
    if ($is_word678{$realtype}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
# Convert a Microsoft word 6/7/8 document
#
# Tries one specialised HTML converter (chosen by the scripting options)
# when HTML is acceptable; otherwise, or if that fails, hands over to
# convertAnything.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted
            = $windows_scripting    ? &native_doc_to_html($input_filename, $output_filestem)
            : $openoffice_scripting ? &openoffice_doc_to_html($input_filename, $output_filestem)
            :                         &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
276
277
# Convert a Rich Text Format (RTF) file
#
# Returns "html" when the specialised conversion succeeds, else "fail".

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted
            = $windows_scripting    ? &native_doc_to_html($input_filename, $output_filestem)
            : $openoffice_scripting ? &openoffice_doc_to_html($input_filename, $output_filestem)
            :                         &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
307
308
# Convert an unidentified file
#
# Tries simple conversion to HTML and then to text, skipping whichever
# format $output_type rules out.  Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # In order of preference: result name, pattern $output_type must match
    # (when one is given), and the routine that does the work
    my @attempts = (
        [ "html", qr/html/i, \&any_to_html ],
        [ "text", qr/text/i, \&any_to_text ],
    );

    foreach my $attempt (@attempts) {
        my ($format, $wanted, $converter) = @{$attempt};
        next unless (!$output_type || ($output_type =~ $wanted));
        return $format if ($converter->($input_filename, $output_filestem));
    }

    return "fail";
}
333
334
335
# Convert an Adobe PDF document
#
# Returns "item" (page images), "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drop everything up to and including the last hyphen of the output type
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if (($output_type =~ m/jp?g|gif|png/i)
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to HTML
    if ((!$output_type || ($output_type =~ m/html/i))
        && &pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if ((!$output_type || ($output_type =~ m/text/i))
        && &pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
370
371
# Convert an Adobe PostScript document
#
# Returns "item" (page images), "text" or "fail".

sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drop everything up to and including the last hyphen of the output type
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if (($output_type =~ m/jp?g|gif|png/i)
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to TEXT
    if ((!$output_type || ($output_type =~ m/text/i))
        && &ps_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
396
397
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run (result "item").  Otherwise, when HTML is
# acceptable, ppttohtml.pl is run (result "html").  If the chosen route
# fails or none applies, any_to_text is tried (result "text"); "fail" is
# returned when that does not work either.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # pptextract option that selects the image format
        my $ppt_convert_type = "";
        if    ($output_type =~ m/gif/i)  { $ppt_convert_type = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $ppt_convert_type = "-j"; }
        elsif ($output_type =~ m/png/i)  { $ppt_convert_type = "-p"; }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        my $redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
        if ($redirect_stderr) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = join("",
                       "perl -S ppttohtml.pl ",
                       " \"$input_filename\" \"$output_filestem.html\"");
        my $redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
        if ($redirect_stderr) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
459
460
# Convert a Microsoft Excel document
#
# When HTML is acceptable xlstohtml.pl is run (result "html"); if that fails
# or HTML was not wanted, any_to_text is tried (result "text").  Returns
# "fail" when nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = join("",
                       "perl -S xlstohtml.pl ",
                       " \"$input_filename\" \"$output_filestem.html\"");
        my $redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
        if ($redirect_stderr) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
493
494
495
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns "rtf" if the first line of the file starts with "{\rtf",
# "word6", "word7" or "word8" if any line contains "Word.Document.N" for
# that N, and "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open with a lexical handle: the filename can never be
    # mistaken for part of the open mode, and the handle cannot clash with
    # other uses of a global name.
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "gsConvert.pl: Unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    # A lexical $line is used so that the caller's $_ is left alone
    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # Single exit point, so the handle is closed whatever the answer was
    close($chk);
    return $type;
}
531
532
533# Specific type-to-type conversions
534#
535# Each of the following functions attempts to convert a document from
536# a specific format to another.  If they succeed they return 1 and leave
537# the output document(s) in the appropriate place; if they fail they
538# return 0 and delete any working files.
539
540
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare on $input_filename, sending its output to
# "$output_filestem.html" and telling it to put associated (image) files in
# a "<docname>_files" directory next to the input file.
#
# Returns 1 if an html file starting with a DOCTYPE HTML line was produced,
# 0 otherwise (the caller can then try any_to_text).  Error output from
# wvWare is echoed on STDERR and appended to $faillogfile when one is set.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # If there is a wv directory under bin/<os> on linux, run wvWare from
    # there, with its bin and lib directories on the search paths.
    my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
    if ( -d $wv_home && $ENV{'GSDLOS'} eq "linux" ) {
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # LD_LIBRARY_PATH is often unset.  Appending an undefined value would
        # warn under -w and leave a trailing ':', i.e. an empty entry, which
        # the dynamic linker reads as the current directory.
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        }
        else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.
    #
    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.
    #
    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message); the links are therefore
    # made relative again once the html has been generated (see below).
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a separate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix (directory plus document name) handed to wvWare's --basename
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        my $errfile;
        if (-s "$output_filestem.err" && open($errfile, '<', "$output_filestem.err")) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print $faillog "$line" if ($write_to_fail_log);
                }
                if ($line !~ m/startup error/) {next;}
                print STDERR " (given an invalid .DOC file?)\n";
                print $faillog " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($faillog) if ($write_to_fail_log);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        # only the first line is needed to recognise wvWare's html
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it was still empty after
            # the html was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred: remove the html, and move
    # whatever is in the .err file to the fail log before removing it too
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
696
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Parameters:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 once the file has been rewritten, 0 if it could not be read or
# written.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string (with $/ undefined
    # the <...> operator reads the entire file at once)
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a hrefs
    # and img src values with assoc_dirname.
    #
    # wvWare put %20 in place of space, so we need to change our prefix to match.
    # The prefix is then escaped with quotemeta so that *every* character
    # that is special in a regular expression (not only ., -, [, ] and
    # backslash, but also parentheses, +, $ and so on) is matched literally.
    my $path_prefix = $assoc_dir_path;
    $path_prefix =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($path_prefix);

    # The following substitution looks for <a or <img, followed by any other
    # attributes and values until it comes to the FIRST (indicated by ?) href= or src=
    # followed by " or ' or no quotes at all around the path, followed by the
    # associated folder's pathname.  All the parts preceding the associated
    # folder's pathname are retained, the pathname is replaced by the
    # associated folder's directory name, and the rest up to and including the
    # closing > is retained.  s makes . match newlines as well, so that tags
    # spread over several lines are handled; g replaces all occurrences.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;

    # Now post-process the href/src values of a and img tags (%20 back to
    # space and so on).  Only the attribute value itself is handed to
    # post_process_assocfile_urls: a greedy .* here would, because of the s
    # modifier, run on to the end of the document and rewrite '%' and
    # backslashes in ordinary text as well.
    $html_contents =~ s{(<(?:a|img)\b[^>]*?\s(?:href|src)=)("[^"]*"|'[^']*'|[^\s>]*)}
                       {&post_process_assocfile_urls($1, $2, "")}sge;

    # delete the original file and recreate it
    &util::rm($html_file);

    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print $fout $html_contents;
    close($fout);
    return 1;
}
782
# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
# introduced in link pathnames by wvWare into space again. Any percent sign that is
# left after that is written as %25.
#
# $pre and $post are passed through untouched; only $text is rewritten.
# Returns the three pieces joined together again.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        # %20 becomes a space and not an underscore, since underscores
        # mess with incremental rebuild
        s/%20/ /g;
        s/\\/\//g;
        # must come last, so that the %20s above are not turned into %2520
        s/%/%25/g;
    }

    return $pre . $text . $post;
}
798
# Attempt to convert a Word document to HTML with the word2html scripting
# program.
#
# Arguments:
#   $input_filename  - path of the document to convert
#   $output_filestem - output path without extension; the converter is asked
#                      to write "$output_filestem.html", and where STDERR can
#                      be redirected its diagnostics go to
#                      "$output_filestem.err"
#
# Returns 1 if "$output_filestem.html" already exists, or was produced and
# its first line mentions "html"; returns 0 otherwise so that the caller can
# try another converter.
#
# Reads the file-level settings $timeout, $faillogfile and $is_winnt_2000.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # On Windows the bare program name is used, so it has to be found via
    # the search path.
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e $html_file) {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";

        if (-s $err_file) {
            # The fail log is optional; a failed open leaves an autovivified
            # but unusable handle behind, so reset it to undef in that case.
            my $faillog_fh;
            if ($faillogfile ne "" && !open($faillog_fh, '>>', $faillogfile)) {
                $faillog_fh = undef;
            }

            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog_fh} "$line" if ($faillog_fh);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog_fh} " (given an invalid .DOC file?)\n"
                        if ($faillog_fh);
                }
                close $err_fh;
            }
            else {
                print STDERR "gsConvert.pl: Unable to read $err_file: $!\n";
            }

            close $faillog_fh if ($faillog_fh);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?  Only the first line is inspected.
    if (-s $html_file) {
        my $first_line;
        if (open(my $html_fh, '<', $html_file)) {
            $first_line = <$html_fh>;
            close $html_fh;
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if (-e $err_file);
            return 1;
        }
    }

    # If here, an error of some sort occurred: discard the output and move
    # the converter's diagnostics into the fail log (when one is configured).
    &util::rm($html_file) if (-e $html_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $log_fh, '>>', $faillogfile)) {
            if (open(my $errlog_fh, '<', $err_file)) {
                while (my $err_line = <$errlog_fh>) {
                    print {$log_fh} $err_line;
                }
                close $errlog_fh;
            }
            close $log_fh;
        }
        &util::rm($err_file);
    }
    return 0;
}
877
# Attempt to convert a Word document to HTML with the oo2html script of the
# OpenOffice extension (JODConverter based).
#
# Arguments:
#   $input_filename  - path of the document to convert
#   $output_filestem - output path without extension; the converter is asked
#                      to write "$output_filestem.html", and where STDERR can
#                      be redirected its diagnostics go to
#                      "$output_filestem.err"
#
# The script is looked up as $GEXT_OPENOFFICE/bin/script/oo2html.
#
# Returns 1 if "$output_filestem.html" already exists, or was produced and
# its first line mentions "html"; returns 0 otherwise (including when the
# oo2html script cannot be found).
#
# Reads the file-level settings $timeout, $faillogfile and $is_winnt_2000.
sub openoffice_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    if (-e $html_file) {
        print STDERR "    The conversion file:\n";
        print STDERR "      $output_filestem.html\n";
        print STDERR "    ... skipping\n";
        return 1;
    }

    my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script");
    my $oo2html = &util::filename_cat($oo_script_dir, "oo2html");
    if (!-e $oo2html) {
        print STDERR "Error: Unable to find 'oo2html' in: \n";
        print STDERR "       $oo_script_dir\n";
        print STDERR "       Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$oo2html \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing oo2html converter: $!\n";
        print STDERR "Command was: $cmd\n";

        if (-s $err_file) {
            # The fail log is optional; a failed open leaves an autovivified
            # but unusable handle behind, so reset it to undef in that case.
            my $faillog_fh;
            if ($faillogfile ne "" && !open($faillog_fh, '>>', $faillogfile)) {
                $faillog_fh = undef;
            }

            if (open(my $err_fh, '<', $err_file)) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog_fh} "$line" if ($faillog_fh);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog_fh} " (given an invalid .DOC file?)\n"
                        if ($faillog_fh);
                }
                close $err_fh;
            }
            else {
                print STDERR "gsConvert.pl: Unable to read $err_file: $!\n";
            }

            close $faillog_fh if ($faillog_fh);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?  Only the first line is inspected.
    if (-s $html_file) {
        my $first_line;
        if (open(my $html_fh, '<', $html_file)) {
            $first_line = <$html_fh>;
            close $html_fh;
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if (-e $err_file);
            return 1;
        }
    }

    # If here, an error of some sort occurred: discard the output and move
    # the converter's diagnostics into the fail log (when one is configured).
    &util::rm($html_file) if (-e $html_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $log_fh, '>>', $faillogfile)) {
            if (open(my $errlog_fh, '<', $err_file)) {
                while (my $err_line = <$errlog_fh>) {
                    print {$log_fh} $err_line;
                }
                close $errlog_fh;
            }
            close $log_fh;
        }
        &util::rm($err_file);
    }
    return 0;
}
962
# Attempt to convert an RTF document to HTML with rtftohtml.
#
# Arguments:
#   $input_filename  - path of the RTF document
#   $output_filestem - output path without extension; rtftohtml is asked to
#                      write "$output_filestem.html"
#
# The conversion counts as successful when the HTML file contains at least
# one word character (outside of tags) after the line holding "<body>".
# If rtftohtml also produced "${output_filestem}_ToC.html", that table of
# contents is merged into the HTML file (with its links made relative) and
# the separate ToC file is removed.
#
# Returns 1 on success; on failure removes the .html/.err files, appends the
# converter's diagnostics to the fail log (if configured) and returns 0.
#
# Reads the file-level settings $timeout, $faillogfile and $is_winnt_2000.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header
        if (open(my $check_fh, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$check_fh>) {
                if (!$past_header) {
                    # the line containing <body> itself is not inspected
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close $check_fh;
        }
        else {
            # an unreadable result is treated like an empty one
            print STDERR "gsConvert.pl: Unable to read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Open the three files in turn and stop at the first failure, so
            # that the output file is never created when an input is missing.
            my ($src_fh, $toc_fh, $out_fh);
            my $src_ok = open($src_fh, '<', $src_file);
            my $toc_ok = $src_ok ? open($toc_fh, '<', $toc_file) : 0;
            my $out_ok = $toc_ok ? open($out_fh, '>', $html_file) : 0;

            if (!$out_ok) {
                # Could not merge: put the unmodified HTML back and still
                # report success, since the conversion itself worked.
                close $toc_fh if ($toc_ok);
                close $src_fh if ($src_ok);
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but not
            # including) the first line without a word character.
            while (defined(my $header_line = <$src_fh>)) {
                last if ($header_line !~ m/\w/);
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc_fh>;            # ignore first 2 lines
            $ignored = <$toc_fh>;
            my $list_start = <$toc_fh>;         # line 3 = "<ol>\n"
            print {$out_fh} $list_start if (defined $list_start);
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $toc_line;
            }
            close $toc_fh;

            # rest of html src
            while (my $src_line = <$src_fh>) {
                print {$out_fh} $src_line;
            }
            close $src_fh;
            close $out_fh;

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog_fh} " (rtf file might be too recent):\n";
            if (open(my $errlog_fh, '<', $err_file)) {
                while (my $err_line = <$errlog_fh>) {
                    print {$faillog_fh} $err_line;
                }
                close $errlog_fh;
            }
            close $faillog_fh;
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
1079
1080
# Convert a PDF file to HTML with the pdftohtml.pl wrapper script.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the result is expected
#                      in "$output_filestem.html"
#
# The command line is assembled from the file-level settings $pdf_zoom,
# $pdf_complex, $pdf_ignore_images, $pdf_allow_images_only and $pdf_nohidden.
# perl -S locates pdftohtml.pl via the search path.
#
# Returns 1 when the command succeeded and left a non-empty HTML file.
# Otherwise the converter's error output is echoed to STDERR, appended to the
# fail log (if configured), the temporary files are removed and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # only STDOUT can be captured here; it lands in the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file) {
        &util::rm($out_file) if (-e $out_file);

        # Read the converter's error output once.  Being unable to read it
        # must not abort the whole script: we still want to clean up and
        # return 0 so the caller can fall back to another converter.
        my @err_lines;
        if (-e $err_file) {
            if (open(my $errlog_fh, '<', $err_file)) {
                @err_lines = <$errlog_fh>;
                close $errlog_fh;
            }
            else {
                print STDERR "gsConvert.pl: Unable to read $err_file: $!\n";
            }
        }

        # print out the converter's std err, if any
        if (@err_lines) {
            print STDERR "pdftohtml error log:\n";
            print STDERR $_ foreach (@err_lines);
        }

        &util::rm($html_file) if (-e $html_file);

        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                print {$faillog_fh} $_ foreach (@err_lines);
                close $faillog_fh;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1142
# Convert a PDF (or PostScript) file to images with the pdfpstoimg.pl wrapper
# script, which relies on ImageMagick.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF/PS file
#   $output_filestem - output path stem handed to pdfpstoimg.pl
#   $output_type     - conversion type; only the part after the last
#                      underscore is passed on as the -convert_to value
#
# Returns 1 when pdfpstoimg.pl exits successfully.  Returns 0 when
# ImageMagick's "identify" cannot be run or when the script fails; in the
# latter case the script's error output is echoed to STDERR and appended to
# the fail log (if configured).  Temporary .out/.err files are removed.
#
# Reads the file-level settings $timeout, $faillogfile and $is_winnt_2000.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # the output itself is irrelevant; only the exit status in $? matters
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) {  # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    # perl -S locates pdfpstoimg.pl via the search path, so no explicit path
    # (and no extra quoting for a GSDLHOME containing spaces) is needed; the
    # PATH is assumed to be set up correctly.
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # only STDOUT can be captured here; it lands in the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &util::rm($out_file) if (-e $out_file);

        # Read the converter's error output once.  Being unable to read it
        # must not abort the whole script: we still want to clean up and
        # return 0.
        my @err_lines;
        if (-e $err_file) {
            if (open(my $errlog_fh, '<', $err_file)) {
                @err_lines = <$errlog_fh>;
                close $errlog_fh;
            }
            else {
                print STDERR "gsConvert.pl: Unable to read $err_file: $!\n";
            }
        }

        #print out the converter's std err, if any
        if (@err_lines) {
            print STDERR "pdfpstoimg error log:\n";
            print STDERR $_ foreach (@err_lines);
        }

        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                print {$faillog_fh} $_ foreach (@err_lines);
                close $faillog_fh;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1211
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# The result only counts when the text file contains at least one word
# character; otherwise it is deleted and treated as a failure.
#
# Returns 1 on success.  On failure the converter's error output is echoed to
# STDERR, appended to the fail log (if configured), the temporary files are
# removed and 0 is returned.
#
# Reads the file-level setting $faillogfile.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # only STDOUT is captured here; it lands in the .err file
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $text_fh, '<', $text_file)) {
            binmode($text_fh); # just in case...
            while (defined(my $line = <$text_fh>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $text_fh;
        }
        else {
            # an unreadable file is handled like one without any text
            warn "open: $!";
        }

        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # Read the converter's error output once.  Being unable to read it
        # must not abort the whole script: we still want to clean up and
        # return 0 so the caller can fall back to another converter.
        my @err_lines;
        if (-e $err_file) {
            if (open(my $errlog_fh, '<', $err_file)) {
                @err_lines = <$errlog_fh>;
                close $errlog_fh;
            }
            else {
                print STDERR "gsConvert.pl: Unable to read $err_file: $!\n";
            }
        }

        # print out the converters std err, if any
        if (@err_lines) {
            print STDERR "pdftotext error log:\n";
            print STDERR $_ foreach (@err_lines);
        }

        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                print {$faillog_fh} $_ foreach (@err_lines);
                close $faillog_fh;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    # the STDOUT capture is only a scratch file; don't leave it behind
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1277
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Strategy:
#   1. On non-Windows systems run gs with ps2ascii.ps.
#   2. If that is skipped or fails, log the problem and fall back to a crude
#      regular-expression based extraction of the strings in the file.
#   3. Re-wrap the resulting text so that lines break at the first
#      whitespace after 72 characters.
#
# Returns 1 on success and 0 when no text file could be produced (input not
# readable, missing "%!" header, or output not writable).
#
# Reads the file-level settings $timeout and $faillogfile.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths so that spaces in the stem survive
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # system returns the raw wait status: non-zero covers a failed
        # launch (-1), a non-zero exit code and death by signal alike.
        # If the program couldn't be started, look at $! for the message.
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $gs_out_fh, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$gs_out_fh>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $gs_out_fh;
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print {$faillog_fh} "gs - $error\n";
            if (-e $err_file) {
                if (open(my $errlog_fh, '<', $err_file)) {
                    while (my $err_line = <$errlog_fh>) {
                        print {$faillog_fh} $err_line;
                    }
                    close $errlog_fh;
                }
            }
            close $faillog_fh;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_in_fh;
        if (!open($ps_in_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$ps_in_fh>); # see man perlport, under "System Resources"
        close $ps_in_fh;

        # Make sure this is a ps file...  (checked before the output file is
        # created, so that no empty .text file is left behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $badhdr_log_fh, '>>', $faillogfile)) {
                print {$badhdr_log_fh} "Bad postscript header: not '%!'\n";
                close $badhdr_log_fh;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        my $ps_out_fh;
        if (!open($ps_out_fh, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$ps_out_fh} "$text";
        close $ps_out_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);

    my $wrap_in_fh;
    if (!open($wrap_in_fh, '<', $tmp_file)) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    my $wrap_out_fh;
    if (!open($wrap_out_fh, '>', $text_file)) {
        print STDERR "Couldn't open file for writing: $!\n";
        close $wrap_in_fh;
        &util::rm($tmp_file);
        return 0;
    }

    while (defined(my $line = <$wrap_in_fh>)) {
        while (length($line) > 0) {
            # Chop off at least $wrap_length characters plus the rest of the
            # current word; whatever cannot be chopped is written as it is.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrap_out_fh} "$1\n";
            } else {
                print {$wrap_out_fh} "$line";
                $line = "";
            }
        }
    }
    close $wrap_in_fh;
    close $wrap_out_fh;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1447
1448
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text and then wrapped in a minimal
# HTML page: blank lines become "<p>", every other line is prefixed with
# "<br> ", and the characters &, < and > are escaped.  The intermediate
# "$output_filestem.text" file is removed afterwards.
#
# Returns 1 on success, 0 if no text could be extracted or the HTML file
# could not be written.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "gsConvert.pl: Unable to read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to write $html_file: $!\n";
        close $text_fh;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # & has to be escaped first, otherwise the entities produced for
        # < and > would be escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1485
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan (read in binary mode)
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Every run of characters outside the printable ASCII range (octal 040-176)
# is treated as a line break, and strings shorter than 10 characters are
# dropped.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 when the
# file-level setting $use_strings is off, when a file cannot be opened, or
# when nothing printable was found (the empty output file is then removed).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    my $text_file = "$output_filestem.text";

    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);
    my $out_fh;
    if (!open($out_fh, '>', $text_file)) {
        close $in_fh;   # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in_fh>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out_fh} $line;
            ++$output_line_count;
        }
    }

    close $out_fh;
    close $in_fh;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;

}
Note: See TracBrowser for help on using the browser.