root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 24124

Revision 24124, 45.7 KB (checked in by ak19, 8 years ago)

Sam worked out that pdftohtml was being launched without quotes around the perl path used to run it. There wre more places that required the quotes. Professor Witten discovered the problem with the CD binary.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
46BEGIN {
47    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49
50    if(!$ENV{'PERLPATH'}) {
51    my $full_perl_exec = $^X;
52    require File::Basename;
53    my $perl_path = File::Basename::dirname($full_perl_exec);
54    $ENV{'PERLPATH'} = $perl_path;
55    }
56
57}
58
59use strict;
60
61use parsargv;
62use util;
63use Cwd;
64
65# Are we running on WinNT or Win2000 (or later)?
66my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
67if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
68
69my $use_strings;
70my $pdf_complex;
71my $pdf_nohidden;
72my $pdf_zoom;
73my $pdf_ignore_images;
74my $pdf_allow_images_only;
75my $windows_scripting;
76
77sub print_usage
78{
79    print STDERR "\n";
80    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
81    print STDERR "              or text using third-party programs.\n\n";
82    print STDERR "  usage: $0 [options] filename\n";
83    print STDERR "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
84    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
85    print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
86    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
87    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
88    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
89    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
90    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
91    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
92    print STDERR "\t\tconverting PDF to HTML\n";
93    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
94    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
95    print STDERR "\t\t-pdf_complex is set\n";
96    exit(1);
97}
98
99my $faillogfile="";
100my $timeout=0;
101
102sub main
103{
104    my (@ARGV) = @_;
105    my ($input_type,$output_type,$verbose);
106
107    # Dynamically figure out what the --type option can support, based on whether -windows_scripting
108    # is in use or not
109    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
110    #my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xlsx?)";
111    #my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xlsx?)";
112    # Currently only have VBA for Word and PPT(but no XLS)
113    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";
114
115    my $type_re = $default_type_re;
116   
117    foreach my $a (@ARGV) {
118        if ($a =~ m/^windows_scripting$/i) {
119            $type_re = $enhanced_type_re;
120        }
121    }
122   
123    # read command-line arguments
124    if (!parsargv::parse(\@ARGV,
125             "type/$type_re/", \$input_type,
126             '/errlog/.*/', \$faillogfile,
127             'output/(auto|html|text|pagedimg).*/', \$output_type,
128             'timeout/\d+/0',\$timeout,
129             'verbose/\d+/0', \$verbose,
130             'windows_scripting',\$windows_scripting,
131             'use_strings', \$use_strings,
132             'pdf_complex', \$pdf_complex,
133             'pdf_ignore_images', \$pdf_ignore_images,
134             'pdf_allow_images_only', \$pdf_allow_images_only,
135             'pdf_nohidden', \$pdf_nohidden,
136             'pdf_zoom/\d+/2', \$pdf_zoom
137             ))
138    {
139    print_usage();
140    }
141     
142    # Make sure the input file exists and can be opened for reading
143    if (scalar(@ARGV!=1)) {
144    print_usage();
145    }
146
147    my $input_filename = $ARGV[0];
148    if (!-r $input_filename) {
149    print STDERR "Error: unable to open $input_filename for reading\n";
150    exit(1);
151    }
152
153    # Deduce filenames
154    my ($tailname,$dirname,$suffix)
155    = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
156    my $output_filestem = &util::filename_cat($dirname, "$tailname");
157
158    if ($input_type eq "")
159    {
160    $input_type = lc (substr($suffix,1,length($suffix)-1));
161    }
162   
163    # Change to temporary working directory
164    my $stored_dir = cwd();
165    chdir ($dirname) || die "Unable to change to directory $dirname";
166
167    # Select convert utility
168    if (!defined $input_type) {
169    print STDERR "Error: No filename extension or input type defined\n";
170    exit(1);
171    }
172    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
173    print &convertDOC($input_filename, $output_filestem, $output_type);
174    print "\n";
175    }
176    elsif ($input_type eq "rtf") {
177    print &convertRTF($input_filename, $output_filestem, $output_type);
178    print "\n";
179    }
180    elsif ($input_type eq "pdf") {
181    print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
182    print "\n";
183    }
184    elsif ($input_type eq "ps") {
185    print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
186    print "\n";
187    }
188    elsif ($input_type =~ m/pptx?$/) {
189    print &convertPPT($input_filename, $output_filestem, $output_type);
190    print "\n";
191    }
192    elsif ($input_type =~ m/xlsx?$/) {
193    print &convertXLS($input_filename, $output_filestem, $output_type);
194    print "\n";
195    }
196    else {
197    print STDERR "Error: Unable to convert type '$input_type'\n";
198    exit(1);
199    }
200   
201    # restore to original working directory
202    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
203
204}
205
206&main(@ARGV);
207
208
209
210# Document-type conversion functions
211#
212# The following functions attempt to convert documents from their
213# input type to the specified output type.  If no output type was
214# given, then they first attempt HTML, and then TEXT.
215#
216# Each returns the output type ("html" or "text") or "fail" if no
217# conversion is possible.
218
219# Convert a Microsoft word document
220
221sub convertDOC {
222    my ($input_filename, $output_filestem, $output_type) = @_;
223
224    # Many .doc files are not in fact word documents!
225    my $realtype = &find_docfile_type($input_filename);
226
227    if ($realtype eq "word6" || $realtype eq "word7"
228        || $realtype eq "word8" || $realtype eq "docx") {
229    return &convertWord678($input_filename, $output_filestem, $output_type);
230    } elsif ($realtype eq "rtf") {
231    return &convertRTF($input_filename, $output_filestem, $output_type);
232    } else {
233    return &convertAnything($input_filename, $output_filestem, $output_type);
234    }
235}
236
237# Convert a Microsoft word 6/7/8 document
238
239sub convertWord678 {
240    my ($input_filename, $output_filestem, $output_type) = @_;
241
242    my $success = 0;
243    if (!$output_type || ($output_type =~ m/html/i)){
244    if ($windows_scripting) {
245        $success = &native_doc_to_html($input_filename, $output_filestem);
246    }
247    else {
248        $success = &doc_to_html($input_filename, $output_filestem);   
249    }
250    if ($success) {
251       return "html";
252    }
253    }
254    return &convertAnything($input_filename, $output_filestem, $output_type);
255}
256
257
258# Convert a Rich Text Format (RTF) file
259
260sub convertRTF {
261    my ($input_filename, $output_filestem, $output_type) = @_;
262
263    my $success = 0;
264
265    # Attempt specialised conversion to HTML
266    if (!$output_type || ($output_type =~ m/html/i)) {
267
268    if ($windows_scripting) {
269        $success = &native_doc_to_html($input_filename, $output_filestem);
270    }
271    else {
272        $success = &rtf_to_html($input_filename, $output_filestem);
273    }
274    if ($success) {
275        return "html";
276    }
277    }
278
279# rtf is so ugly that's it's not worth running strings over.
280# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
281#    return &convertAnything($input_filename, $output_filestem, $output_type);
282    return "fail";
283}
284
285
286# Convert an unidentified file
287
288sub convertAnything {
289    my ($input_filename, $output_filestem, $output_type) = @_;
290   
291    my $success = 0;
292 
293    # Attempt simple conversion to HTML
294    if (!$output_type || ($output_type =~ m/html/i)) {
295    $success = &any_to_html($input_filename, $output_filestem);
296    if ($success) {
297        return "html";
298    }
299    }
300
301    # Convert to text
302    if (!$output_type || ($output_type =~ m/text/i)) {
303    $success = &any_to_text($input_filename, $output_filestem);
304    if ($success) {
305        return "text";
306    }
307    }
308    return "fail";
309}
310
311
312
313# Convert an Adobe PDF document
314
315sub convertPDF {
316    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
317
318    my $success = 0;
319    $output_type =~ s/.*\-(.*)/$1/i;
320    # Attempt coversion to Image
321    if ($output_type =~ m/jp?g|gif|png/i) {
322    $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
323    if ($success){
324        return "item";
325    }
326    }
327
328    # Attempt conversion to HTML
329    if (!$output_type || ($output_type =~ m/html/i)) {
330    $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
331    if ($success) {
332        return "html";
333    }
334    }
335
336    # Attempt conversion to TEXT
337    if (!$output_type || ($output_type =~ m/text/i)) {
338    $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
339    if ($success) {
340        return "text";
341    }
342    }
343
344    return "fail";
345
346}
347
348
349# Convert an Adobe PostScript document
350
351sub convertPS {
352    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;
353
354    my $success = 0;
355    $output_type =~ s/.*\-(.*)/$1/i;
356    # Attempt coversion to Image
357    if ($output_type =~ m/jp?g|gif|png/i) {
358    $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
359    if ($success){
360        return "item";
361    }
362    }
363
364    # Attempt conversion to TEXT
365    if (!$output_type || ($output_type =~ m/text/i)) {
366    $success = &ps_to_text($input_filename, $output_filestem);
367    if ($success) {
368        return "text";
369    }
370    }
371    return "fail";
372}
373
374
375sub convertPPT {
376    my ($input_filename, $output_filestem, $output_type) = @_;
377    my $success = 0;
378
379    my $ppt_convert_type = "";
380
381    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
382    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){
383    if ($output_type =~ m/gif/i) {
384        $ppt_convert_type = "-g";
385    } elsif ($output_type =~ m/jp?g/i){
386        $ppt_convert_type = "-j";
387    } elsif ($output_type =~ m/png/i){
388        $ppt_convert_type = "-p";
389    }
390    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
391                       $ENV{'GSDLOS'}, "pptextract");
392    $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
393           
394    my $cmd = "";
395    if ($timeout) {$cmd = "ulimit -t $timeout;";}
396    # if the converting directory already exists
397    if (-d $output_filestem) {
398        print STDERR "**The conversion directory already exists\n";
399        return "item";
400    } else {
401        $cmd .=  "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
402        $cmd .= " 2>\"$output_filestem.err\""
403        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
404        if (system($cmd) !=0) {
405        print STDERR "Powerpoint VB Scripting convert failed\n";
406        } else {
407        return "item";
408        }
409    }
410    } elsif (!$output_type || ($output_type =~ m/html/i)) {
411    # Attempt conversion to HTML
412    #if (!$output_type || ($output_type =~ m/html/i)) {
413    # formulate the command
414    my $cmd = "";
415    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
416    $cmd .= "\"$full_perl_path\" -S ppttohtml.pl ";
417    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
418    $cmd .= " 2>\"$output_filestem.err\""
419        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
420
421    # execute the command
422    $!=0;
423    if (system($cmd)!=0)
424    {
425        print STDERR "Powerpoint 95/97 converter failed $!\n";
426    } else {
427        return "html";
428    }
429    }
430
431    $success = &any_to_text($input_filename, $output_filestem);
432    if ($success) {
433    return "text";
434    }
435   
436    return "fail";
437}
438
439
440sub convertXLS {
441    my ($input_filename, $output_filestem, $output_type) = @_;
442
443    my $success = 0;
444
445    # Attempt conversion to HTML
446    if (!$output_type || ($output_type =~ m/html/i)) {
447    # formulate the command
448    my $cmd = "";
449    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
450    $cmd .= "\"$full_perl_path\" -S xlstohtml.pl ";
451    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
452    $cmd .= " 2>\"$output_filestem.err\""
453        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
454   
455   
456    # execute the command
457    $!=0;
458    if (system($cmd)!=0)
459    {
460        print STDERR "Excel 95/97 converter failed $!\n";
461    } else {
462        return "html";
463    }
464    }
465
466    $success = &any_to_text($input_filename, $output_filestem);
467    if ($success) {
468    return "text";
469    }
470
471    return "fail";
472}
473
474
475
476# Find the real type of a .doc file
477#
478# We seem to have a lot of files with a .doc extension that are .rtf
479# files or Word 5 files.  This function attempts to tell the difference.
480sub find_docfile_type {
481    my ($input_filename) = @_;
482   
483    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
484        return "docx";
485    }
486   
487    open(CHK, "<$input_filename");
488    binmode(CHK);
489    my $line = "";
490    my $first = 1;
491
492    while (<CHK>) {
493   
494    $line = $_;
495
496    if ($first) {
497        # check to see if this is an rtf file
498        if ($line =~ m/^\{\\rtf/) {
499        close(CHK);
500        return "rtf";
501        }
502        $first = 0;
503    }
504   
505    # is this is a word 6/7/8 document?
506    if ($line =~ m/Word\.Document\.([678])/) {
507        close(CHK);
508
509        return "word$1";
510    }
511
512    }
513
514    return "unknown";
515}
516
517
518# Specific type-to-type conversions
519#
520# Each of the following functions attempts to convert a document from
521# a specific format to another.  If they succeed they return 1 and leave
522# the output document(s) in the appropriate place; if they fail they
523# return 0 and delete any working files.
524
525
526# Attempt to convert a word document to html with the wv program
527sub doc_to_html {
528    my ($input_filename, $output_filestem) = @_;
529
530    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");
531
532    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
533        $ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}";
534        $ENV{'LD_LIBRARY_PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib:$ENV{'LD_LIBRARY_PATH'}";
535        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
536    }
537
538    # don't include path on windows (to avoid having to play about
539    # with quoting when GSDLHOME might contain spaces) but assume
540    # that the PATH is set up correctly
541    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
542
543    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
544                      "packages", "wv", "wvHtml.xml");
545   
546    # Added the following to work with replace_srcdoc_with_html.pl:
547    # Make wvWare put any associated (image) files of the word doc into
548    # folder docname-without-extention_files. This folder should be at
549    # the same level as the html file generated from the doc.
550    # wvWare will take care of proper interlinking.
551
552    # This step is necessary for replace_srcdoc_with_html.pl which will
553    # move the html and associated files into the import folder. We
554    # want to ensure that the associated files won't overwrite similarly
555    # named items already in import. Hence we put them in a folder first
556    # (to which the html links properly) and that will allow
557    # replace_srcdoc_with_html.pl to move them safely to /import.
558
559    # To do all this, we need to use wvWare's --dir and --basename options
560    # where dir is the full path to the image folder directory and
561    # basename is the full path to the image folder appended to the name
562    # which is to be prepended to every image file:
563    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
564    # then the basename is "/full/path/to/imgdir/sample".
565    # In this case, basename is the full path to and name of the document.
566    # HOWEVER: basename always takes full path, not relative url, so
567    # the greenstone browser is unable to display the images (absolute paths
568    # cause it to give an "external link" message)
569    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
570    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
571    # "added --dir option to wvHtml so that pictures can be placed in
572    # a seperate directory"
573    # "running wvWare through IMP to view word documents as html. It gets
574    # invoked like this:
575    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"
576   
577    # toppath is the folder where html is generated
578    # docname is the name (without extension) of the html to be generated
579    # suffix (extension) is thrown away
580    my ($docname, $toppath)
581    = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
582
583    # We want the image folder generated to have the same name as windows
584    # would generate ($windows_scripting) when it converts from word to html.
585    # That is, foldername=docname_files
586    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
587    #print "assoc_dir: ".$assoc_dir."\n";  # same as "$output_filestem._files"
588   
589    # ensure this image directory exists
590    # if it exists already, just delete and recreate
591    if(-e $assoc_dir) {
592    &util::rm_r($assoc_dir);
593    } 
594    &util::mk_dir($assoc_dir);
595
596    # the images are all going to be called image0, image1,..., imageN
597    my $img_basenames = &util::filename_cat($assoc_dir, $docname);
598   
599    #print STDERR "****toppath: $toppath\n****docname: $docname\n;
600    #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
601    #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);
602
603    my $cmd = "";
604    if ($timeout) {$cmd = "ulimit -t $timeout;";}
605    # wvWare's --dir and --basename options for image directory.
606    # Replaced the next line with the *2 lines* following it:
607               # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
608    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
609    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
610    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
611
612    # redirecting STDERR is a bad idea on windows 95/98
613    $cmd .= " 2> \"$output_filestem.err\""
614    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
615    # execute the command
616    $!=0;
617    if (system($cmd)!=0)
618    {
619    print STDERR "Error executing wv converter:$!\n";
620    if (-s "$output_filestem.err") {
621        open (ERRFILE, "<$output_filestem.err");
622
623        my $write_to_fail_log=0;
624        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
625        {$write_to_fail_log=1;}
626
627        my $line;
628        while ($line=<ERRFILE>) {
629        if ($line =~ m/\w/) {
630            print STDERR "$line";
631            print FAILLOG "$line" if ($write_to_fail_log);
632        }
633        if ($line !~ m/startup error/) {next;}
634        print STDERR " (given an invalid .DOC file?)\n";
635        print FAILLOG " (given an invalid .DOC file?)\n"
636        if ($write_to_fail_log);
637       
638        } # while ERRFILE
639        close FAILLOG if ($write_to_fail_log);
640    }
641    return 0; # we can try any_to_text
642    }
643
644    # Was the conversion successful?
645
646    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
647    open(TMP, "$output_filestem.html");
648    my $line = <TMP>;
649    close(TMP);
650    if ($line && $line =~ m/DOCTYPE HTML/) {
651        &util::rm("$output_filestem.err") if -e "$output_filestem.err";   
652
653        # Inserted this code to remove the images directory if it was still empty after
654        # the html was generated (in case there were no images in the word document)
655        if (&util::is_dir_empty($assoc_dir)) {
656        #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n";
657        &util::rm_r($assoc_dir);
658        } else { # there was an image folder (it was generated)
659        # Therefore, the html file generated contains absolute links to the images
660        # Replace them with relative links instead, so the folder can be moved elsewhere
661        &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");   
662        }
663        return 1;
664    }
665    }
666   
667    # If here, an error of some sort occurred
668    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
669    if (-e "$output_filestem.err") {
670    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
671        open (ERRLOG,"$output_filestem.err");
672        while (<ERRLOG>) {print FAILLOG $_;}
673        close FAILLOG;
674        close ERRLOG;
675    }
676    &util::rm("$output_filestem.err");
677    }
678   
679    return 0;
680}
681
682# Method to work with doc_to_html - Word docs might contain images.
683# When such word docs are converted with wvWare, we make it generate a
684# <filename>_files folder with the associated images, while the html file
685# <filename> refers to the images using absolute paths to <filename>_files.
686# This method reads in that html file and replaces all the absolute paths to
687# the images in <filename>_files with the relative paths to the images from
688# that folder. (I.e. with <filename>_files/<imagename.ext>).
689sub make_links_to_assocdir_relative{
690    # toppath is the top-level folder in which the html file we're going to be fixing resides
691    # docname is just the name (without extension) of the html file
692    # html_file is the full path to the html file: /full/path/docname.html
693    # assoc_dir_path is toppath/docname_files
694    # assoc_dirname is the directory name of the folder with associated imgs: docname_files
695    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;
696
697    # 1. Read all the contents of the html into a string
698    # open the original file for reading
699    unless(open(FIN, "<$html_file")) {
700    print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
701    return 0;
702    }
703    # From http://perl.plover.com/local.html
704    # "It's cheaper to read the file all at once, without all the splitting and reassembling.
705    # (Some people call this slurping the file.) Perl has a special feature to support this:
706    # If the $/ variable is undefined, the <...> operator will read the entire file all at once"
707    my $html_contents;
708    {
709    local $/ = undef;        # Read entire file at once
710    $html_contents = <FIN>;  # Now file is read in as one single 'line'
711    }
712    close(FIN); # close the file
713    #print STDERR $html_contents;
714   
715    # 2. Replace (substitute) *all* ocurrences of the assoc_dir_path in a hrefs and img src
716    # values with assoc_dirname
717    # At the end: g means substitute all occurrences (global), while s at the end means treat
718    # all new lines as a regular space. This interacts with g to consider all the lines
719    # together as a single line so that multi-occurrences can be replaced.
720
721    # we can't just replace $assoc_dir_path with $assoc_dir
722    # $assoc_dir_path represents a regular expression that needs to be replaced
723    # if it contains ., -, [, ], or Windows style backslashes in paths  -- which all have special
724    # meaning in Perl regular expressions -- we need to escape these first
725    my $safe_reg_expression = $assoc_dir_path;
726    $safe_reg_expression =~ s/\\/\\\\/g;
727    $safe_reg_expression =~ s/\./\\./g;
728    $safe_reg_expression =~ s/\-/\\-/g;
729    $safe_reg_expression =~ s/\[/\\[/g;
730    $safe_reg_expression =~ s/\]/\\]/g;
731    $safe_reg_expression =~ s/ /%20/g; # wvWare put %20 in place of space, so we need to change our prefix to match
732
733    # The following regular expression substitution looks for <a or <image, followed by any other
734    # attributes and values until it comes to the FIRST (indicated by ?) href= or src=
735    # followed by " or ' no quotes at all around path, followed by the associated folder's pathname
736    # followed by characters (for the img filename), then finally the optional closing quotes
737    # in " or ' form, followed by any other attributes and values until the first > to end the tag.
738    # The substitution: all the parts preceding associated folder's pathname are retained,
739    # the associated folder path name is replaced by associated folder directory name
740    # and the rest upto and including the closing > tag is retained.
741    # The sg at the end of the pattern match treats all of html_contents as a single line (s)
742    # and performs a global replace (g) meaning that all occurrences that match in that single line
743    # are substituted.
744    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;
745               #$html_contents =~ s/$safe_reg_expression/$assoc_dirname/gs; # this works, used as fall-back
746    # now replace any %20 chars in filenames of href or src attributes to use literal space ' '. Calls a function for this
747    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;
748
749    #print STDERR "****assoc_dirname: $assoc_dirname***\n";
750    #print STDERR "****safe_reg_expression: $safe_reg_expression***\n";
751   
752    # delete the original file and recreate it
753    my $copy_of_filename = $html_file;
754    &util::rm($copy_of_filename); # deleted the file
755
756    # Recreate the original file for writing the updated contents
757    unless(open(FOUT, ">$html_file")) {  # open it as a new file for writing
758    print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
759    return 0;
760    }
761
762    # write out the updated contents and close the file
763    print FOUT $html_contents;
764    close(FOUT);
765    return 1;
766}
767
768# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
769# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
770# introduced in link pathnames by wvWare into space again. Converts all percent signs
771# introduced by URL encoding filenames generated into %25 in these url links referencing them
772sub post_process_assocfile_urls
773{
774    my ($pre, $text, $post) = @_;
775
776    $text =~ s/%20/ /g; # Convert %20s to space and not underscore since underscores mess with incremental rebuild
777    # $text =~ s/%20/_/g; # reinstated this line, since we no longer replace spaces with %20. We replace them with underscores
778    $text =~ s/\\/\//g;
779    $text =~ s/%/%25/g;
780
781    return "$pre$text$post";
782}
783
784# Attempt to convert a word document to html with the word2html scripting program
785sub native_doc_to_html {
786    my ($input_filename, $output_filestem) = @_;
787
788    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
789                       $ENV{'GSDLOS'}, "word2html");
790
791    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
792    if (-e "$output_filestem.html") {
793    print STDERR "    The conversion file:\n";
794    print STDERR "      $output_filestem.html\n";
795    print STDERR "    ... already exists.  Skipping\n";
796    return 1;
797    }
798
799    my $cmd = "";
800    if ($timeout) {$cmd = "ulimit -t $timeout;";}
801    #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
802    #$cmd .=  "$vbScript $input_filename $output_filestem.html";
803    $cmd .=  "$vbScript \"$input_filename\" \"$output_filestem.html\"";
804
805    # redirecting STDERR
806    $cmd .= " 2> \"$output_filestem.err\""
807    if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
808   
809    # execute the command
810    $!=0;
811    if (system($cmd)!=0)
812    {
813    print STDERR "Error executing word2Html converter:$!\n";
814    if (-s "$output_filestem.err") {
815        open (ERRFILE, "<$output_filestem.err");
816       
817        my $write_to_fail_log=0;
818        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
819        {$write_to_fail_log=1;}
820
821        my $line;
822        while ($line=<ERRFILE>) {
823        if ($line =~ m/\w/) {
824            print STDERR "$line";
825            print FAILLOG "$line" if ($write_to_fail_log);
826        }
827        if ($line !~ m/startup error/) {next;}
828        print STDERR " (given an invalid .DOC file?)\n";
829        print FAILLOG " (given an invalid .DOC file?)\n"
830        if ($write_to_fail_log);
831       
832        } # while ERRFILE
833        close FAILLOG if ($write_to_fail_log);
834    }
835    return 0; # we can try any_to_text
836    }
837
838    # Was the conversion successful?
839    if (-s "$output_filestem.html") {
840    open(TMP, "$output_filestem.html");
841    my $line = <TMP>;
842    close(TMP);
843    if ($line && $line =~ m/html/i) {
844        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
845        return 1;
846    }
847    }
848   
849    # If here, an error of some sort occurred
850    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
851    if (-e "$output_filestem.err") {
852    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
853        open (ERRLOG,"$output_filestem.err");
854        while (<ERRLOG>) {print FAILLOG $_;}
855        close FAILLOG;
856        close ERRLOG;
857    }
858    &util::rm("$output_filestem.err");
859    }
860    return 0;
861}
862
863# Attempt to convert an RTF document to html with rtftohtml
864sub rtf_to_html {
865    my ($input_filename, $output_filestem) = @_;
866
867    # formulate the command
868    my $cmd = "";
869    if ($timeout) {$cmd = "ulimit -t $timeout;";}
870    $cmd .= "rtftohtml";
871    #$cmd .= "rtf-converter";
872
873    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
874
875    $cmd .= " 2>\"$output_filestem.err\""
876        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
877
878
879    # execute the command
880    $!=0;
881    if (system($cmd)!=0)
882    {
883    print STDERR "Error executing rtf converter $!\n";
884    # don't currently bother printing out error log...
885    # keep going, in case it still created an HTML file...
886    }
887
888    # Was the conversion successful?
889    my $was_successful=0;
890    if (-s "$output_filestem.html") {
891    # make sure we have some content other than header
892    open (HTML, "$output_filestem.html"); # what to do if fail?
893    my $line;
894    my $past_header=0;
895    while ($line=<HTML>) {
896
897        if ($past_header == 0) {
898        if ($line =~ m/<body>/) {$past_header=1;}
899        next;
900        }
901
902        $line =~ s/<[^>]+>//g;
903        if ($line =~ m/\w/ && $past_header) {  # we found some content...
904        $was_successful=1;
905        last;
906        }
907    }
908    close HTML;
909    }
910
911    if ($was_successful) {
912    &util::rm("$output_filestem.err")
913        if (-e "$output_filestem.err");
914    # insert the (modified) table of contents, if it exists.
915    if (-e "${output_filestem}_ToC.html") {
916        &util::mv("$output_filestem.html","$output_filestem.src");
917        my $open_failed=0;
918        open HTMLSRC, "$output_filestem.src" || ++$open_failed;
919        open TOC, "${output_filestem}_ToC.html" || ++$open_failed;
920        open HTML, ">$output_filestem.html" || ++$open_failed;
921       
922        if ($open_failed) {
923        close HTMLSRC;
924        close TOC;
925        close HTML;
926        &util::mv("$output_filestem.src","$output_filestem.html");
927        return 1;
928        }
929
930        # print out header info from src html.
931        while (defined($_ = <HTMLSRC>) && $_ =~ m/\w/) {
932        print HTML "$_";
933        }
934
935        # print out table of contents, making links relative
936        <TOC>; <TOC>; # ignore first 2 lines
937        print HTML scalar(<TOC>); # line 3 = "<ol>\n"
938        my $line;
939        while ($line=<TOC>) {
940        $line =~ s@</body></html>$@@i ; # only last line has this
941        # make link relative
942        $line =~ s@href=\"[^\#]+@href=\"@i;
943        print HTML $line;
944        }
945        close TOC;
946
947        # rest of html src
948        while (<HTMLSRC>) {
949        print HTML $_;
950        }
951        close HTMLSRC;
952        close HTML;
953
954        &util::rm("${output_filestem}_ToC.html");
955        &util::rm("${output_filestem}.src");
956    }
957    # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
958    return 1; # success
959    }
960
961    if (-e "$output_filestem.err") {
962    if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
963    {
964        print FAILLOG "Error - rtftohtml - couldn't extract text\n";
965        #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
966        print FAILLOG " (rtf file might be too recent):\n";
967        open (ERRLOG, "$output_filestem.err");
968        while (<ERRLOG>) {print FAILLOG $_;}
969        close ERRLOG;
970        close FAILLOG;
971    }
972    &util::rm("$output_filestem.err");
973    }
974
975    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
976
977    return 0;
978}
979
980
981# Convert a pdf file to html with the pdftohtml command
982
983sub pdf_to_html {
984    my ($dirname, $input_filename, $output_filestem) = @_;
985
986    my $cmd = "";
987    if ($timeout) {$cmd = "ulimit -t $timeout;";}
988    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
989    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
990    $cmd .= " -c" if ($pdf_complex);
991    $cmd .= " -i" if ($pdf_ignore_images);
992    $cmd .= " -a" if ($pdf_allow_images_only);
993    $cmd .= " -hidden" unless ($pdf_nohidden);
994    $cmd .= " \"$input_filename\" \"$output_filestem\"";
995   
996    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
997    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
998    } else {
999    $cmd .= " > \"$output_filestem.err\"";
1000    }
1001
1002    $!=0;
1003
1004    my $retval=system($cmd);
1005    if ($retval!=0)
1006    {
1007    print STDERR "Error executing pdftohtml.pl";
1008    if ($!) {print STDERR ": $!";}
1009    print STDERR "\n";
1010    }
1011
1012    # make sure the converter made something
1013    if ($retval!=0 || ! -s "$output_filestem.html")
1014    {
1015    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
1016    # print out the converter's std err, if any
1017    if (-s "$output_filestem.err") {
1018        open (ERRLOG, "$output_filestem.err") || die "$!";
1019        print STDERR "pdftohtml error log:\n";
1020        while (<ERRLOG>) {
1021        print STDERR "$_";
1022        }
1023        close ERRLOG;
1024    }
1025    print STDERR "***********output filestem $output_filestem.html\n";
1026    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
1027    if (-e "$output_filestem.err") {
1028        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
1029        {
1030        open (ERRLOG, "$output_filestem.err");
1031        while (<ERRLOG>) {print FAILLOG $_;}
1032        close ERRLOG;
1033        close FAILLOG;
1034        }   
1035        &util::rm("$output_filestem.err");
1036    }
1037    return 0;
1038    }
1039
1040    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1041    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
1042    return 1;
1043}
1044
1045# Convert a pdf file to various types of image with the convert command
1046
1047sub pdfps_to_img {
1048    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
1049
1050    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
1051    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
1052    my $result = `identify 2>&1`;
1053    if ($? == -1 || $? == 256) {  # Linux and Windows return different values for "program not found"
1054        #ImageMagick is not installed, thus the convert utility is not available.
1055        print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
1056        return 0;
1057    }
1058    }
1059
1060    my $cmd = "";
1061    if ($timeout) {$cmd = "ulimit -t $timeout;";}
1062    $output_type =~ s/.*\_(.*)/$1/i;
1063    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
1064    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
1065    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
1066    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
1067    } else {
1068    $cmd .= " > \"$output_filestem.err\"";
1069    }
1070
1071    # don't include path on windows (to avoid having to play about
1072    # with quoting when GSDLHOME might contain spaces) but assume
1073    # that the PATH is set up correctly
1074    $!=0;
1075    my $retval=system($cmd);
1076    if ($retval!=0)
1077    {
1078    print STDERR "Error executing pdftoimg.pl";
1079    if ($!) {print STDERR ": $!";}
1080    print STDERR "\n";
1081    }
1082
1083    #make sure the converter made something
1084    #if ($retval !=0) || ! -s "$output_filestem")
1085    if ($retval !=0)
1086    {
1087    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
1088    #print out the converter's std err, if any
1089    if (-s "$output_filestem.err") {
1090        open (ERRLOG, "$output_filestem.err") || die "$!";
1091        print STDERR "pdfpstoimg error log:\n";
1092        while (<ERRLOG>) {
1093        print STDERR "$_";
1094        }
1095        close ERRLOG;
1096    }
1097    #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
1098    if (-e "$output_filestem.err") {
1099        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
1100        {
1101        open (ERRLOG, "$output_filestem.err");
1102        while (<ERRLOG>) {print FAILLOG $_;}
1103        close ERRLOG;
1104        close FAILLOG;
1105       }   
1106        &util::rm("$output_filestem.err");
1107    }
1108    return 0;
1109    }
1110    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1111    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
1112    return 1;
1113}
1114
1115# Convert a PDF file to text with the pdftotext command
1116
1117sub pdf_to_text {
1118    my ($dirname, $input_filename, $output_filestem) = @_;
1119
1120    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
1121
1122    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
1123    $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
1124    } else {
1125    $cmd .= " > \"$output_filestem.err\"";
1126    }
1127   
1128    if (system($cmd)!=0)
1129    {
1130    print STDERR "Error executing $cmd: $!\n";
1131    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1132    }
1133
1134    # make sure there is some extracted text.
1135    if (-e "$output_filestem.text") {
1136    open (EXTR_TEXT, "$output_filestem.text") || warn "open: $!";
1137    binmode(EXTR_TEXT); # just in case...
1138    my $line="";
1139    my $seen_text=0;
1140    while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
1141        if ($line=~ m/\w/) {$seen_text=1;}
1142    }
1143    close EXTR_TEXT;
1144    if ($seen_text==0) { # no text was extracted
1145        print STDERR "Error: pdftotext found no text\n";
1146        &util::rm("$output_filestem.text");
1147    }
1148    }
1149
1150    # make sure the converter made something
1151    if (! -s "$output_filestem.text")
1152    {
1153    # print out the converters std err, if any
1154    if (-s "$output_filestem.err") {
1155        open (ERRLOG, "$output_filestem.err") || die "$!";
1156        print STDERR "pdftotext error log:\n";
1157        while (<ERRLOG>) {
1158        print STDERR "$_";
1159        }
1160        close ERRLOG;
1161    }
1162    # does this converter create a .out file?
1163    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
1164    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1165    if (-e "$output_filestem.err") {
1166        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
1167        {
1168        open (ERRLOG,"$output_filestem.err");
1169        while (<ERRLOG>) {print FAILLOG $_;}
1170        close ERRLOG;
1171        close FAILLOG;
1172        }
1173        &util::rm("$output_filestem.err");
1174    }
1175    return 0;
1176    }
1177    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1178    return 1;
1179}
1180
1181# Convert a PostScript document to text
1182# note - just using "ps2ascii" isn't good enough, as it
1183# returns 0 for a postscript interpreter error. ps2ascii is just
1184# a wrapper to "gs" anyway, so we use that cmd here.
1185
1186sub ps_to_text {
1187    my ($input_filename, $output_filestem) = @_;
1188
1189    my $error = "";
1190
1191    # if we're on windows we'll fall straight through without attempting
1192    # to use gs
1193    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
1194    $error = "Windows does not support gs";
1195
1196    } else {
1197    my $cmd = "";
1198    if ($timeout) {$cmd = "ulimit -t $timeout; ";}
1199    $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
1200    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
1201    #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
1202    $cmd .= " 2> $output_filestem.err";
1203    $!=0;
1204
1205    my $retcode=system($cmd);
1206    $retcode = $? >> 8;  # see man perlfunc - system for this...
1207    # if system returns -1 | 127 (couldn't start program), look at $! for message
1208
1209    if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
1210    elsif (! -e "$output_filestem.text") {
1211        $error="did not create output file.\n";
1212    }
1213    else
1214    {   # make sure the interpreter didn't get an error. It is technically
1215        # possible for the actual text to start with this, but....
1216        open PSOUT, "$output_filestem.text";
1217        if (<PSOUT> =~ m/^Error: (.*)/) {
1218        $error="interpreter error - \"$1\"";
1219        }
1220        close PSOUT;
1221    }
1222    }
1223
1224    if ($error ne "")
1225    {
1226    print STDERR "Warning: Error executing gs: $error\n";
1227    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1228
1229    if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
1230    {
1231        print FAILLOG "gs - $error\n";
1232        if (-e "$output_filestem.err") {
1233        open(ERRLOG, "$output_filestem.err");
1234        while (<ERRLOG>) {print FAILLOG $_;}
1235        close ERRLOG;
1236        }
1237        close FAILLOG;
1238    }
1239    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1240
1241
1242    # Fine then. We'll just do a lousy job by ourselves...
1243    # Based on 5-line regexp sed script found at:
1244    # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
1245    #
1246    print STDERR "Stripping text from postscript\n";
1247    my $errorcode=0;
1248    open (IN, "$input_filename")
1249        ||  ($errorcode=1, warn "Couldn't read file: $!");
1250    open (OUT, ">$output_filestem.text")
1251        ||  ($errorcode=1, warn "Couldn't write file: $!");
1252    if ($errorcode) {print STDERR "errors\n";return 0;}
1253   
1254    my $text="";  # this is for whole .ps file...
1255    $text = join('', <IN>); # see man perlport, under "System Resources"
1256    close IN;
1257
1258    # Make sure this is a ps file...
1259    if ($text !~ m/^%!/) {
1260        print STDERR "Bad postscript header: not '%!'\n";
1261        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
1262        {
1263        print FAILLOG "Bad postscript header: not '%!'\n";
1264        close FAILLOG;
1265        }
1266        return 0;
1267    }
1268
1269    # if ps has Page data, then use it to delete all stuff before it.
1270    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line
1271   
1272    # remove all leading non-data stuff
1273    $text =~ s/^.*?\(//s;
1274
1275    # remove all newline chars for easier processing
1276    $text =~ s/\n//g;
1277   
1278    # Big assumption here - assume that if any co-ordinates are
1279    # given, then we are at the end of a sentence.
1280    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
1281
1282    # special characters--
1283    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?
1284
1285    # ? ps text formatting (eg italics?) ?
1286    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
1287    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
1288    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
1289    # default - remove the rest
1290    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
1291
1292    # attempt to add whitespace between words...
1293    # this is based purely on observation, and may be completely wrong...
1294    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
1295    # eg I notice "b(" is sometimes NOT a space if preceded by a
1296    # negative number.
1297    $text =~ s/\)\d+ ?b\(/\) \( /g;
1298
1299    # change quoted braces to brackets
1300    $text =~ s/([^\\])\\\(/$1\{/g;
1301    $text =~ s/([^\\])\\\)/$1\}/g ;
1302
1303    # remove everything that is not between braces
1304    $text =~ s/\)([^\(\)])+?\(//sg ;
1305   
1306    # remove any Trailer eof stuff.
1307    $text =~ s/\)[^\)]*$//sg;
1308
1309    ### ligatures have special characters...
1310    $text =~ s/\\013/ff/g;
1311    $text =~ s/\\014/fi/g;
1312    $text =~ s/\\015/fl/g;
1313    $text =~ s/\\016/ffi/g;
1314    $text =~ s/\\214/fi/g;
1315    $text =~ s/\\215/fl/g;
1316    $text =~ s/\\017/\n\* /g; # asterisk?
1317    $text =~ s/\\023/\023/g;  # e acute ('e)
1318    $text =~ s/\\177/\252/g;  # u"
1319#   $text =~ s/ ?? /\344/g;  # a"
1320
1321    print OUT "$text";
1322    close OUT;
1323    }
1324    # wrap the text - use a minimum length. ie, first space after this length.
1325    my $wrap_length=72;
1326    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
1327    open INFILE, "$output_filestem.text.tmp" ||
1328    die "Couldn't open file: $!";
1329    open OUTFILE, ">$output_filestem.text" ||
1330    die "Couldn't open file for writing: $!";
1331    my $line="";
1332    while ($line=<INFILE>) {
1333    while (length($line)>0) {
1334        if (length($line)>$wrap_length) {
1335        $line =~ s/^(.{$wrap_length}[^\s]*)\s*//;
1336        print OUTFILE "$1\n";
1337        } else {
1338        print OUTFILE "$line";
1339        $line="";
1340        }
1341    }
1342    }
1343    close INFILE;
1344    close OUTFILE;
1345    &util::rm("$output_filestem.text.tmp");
1346
1347    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1348    return 1;
1349}
1350
1351
1352# Convert any file to HTML with a crude perl implementation of the
1353# UNIX strings command.
1354
1355sub any_to_html {
1356    my ($input_filename, $output_filestem) = @_;
1357
1358    # First generate a text file
1359    return 0 unless (&any_to_text($input_filename, $output_filestem));
1360
1361    # create an HTML file from the text file
1362    open(TEXT, "<$output_filestem.text");
1363    open(HTML, ">$output_filestem.html");
1364
1365    print HTML "<html><head>\n";
1366    print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
1367    print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
1368    print HTML "</head><body>\n\n";
1369
1370    my $line;
1371    while ($line=<TEXT>) {
1372    $line =~ s/</&lt;/g;
1373    $line =~ s/>/&gt;/g;
1374    if ($line =~ m/^\s*$/) {
1375        print HTML "<p>";
1376    } else {
1377        print HTML "<br> ", $line;
1378    }
1379    }
1380    print HTML "\n</body></html>\n";
1381
1382    close HTML;
1383    close TEXT;
1384
1385    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1386    return 1;
1387}
1388
1389# Convert any file to TEXT with a crude perl implementation of the
1390# UNIX strings command.
1391# Note - this assumes ascii charsets :(     (jrm21)
1392
1393sub any_to_text {
1394    my ($input_filename, $output_filestem) = @_;
1395
1396    if (!$use_strings) {
1397      return 0;
1398    }
1399
1400    print STDERR "\n**** In any to text****\n\n";
1401    open(IN, "<$input_filename") || return 0;
1402    binmode(IN);
1403    open(OUT, ">$output_filestem.text") || return 0;
1404
1405    my ($line);
1406    my $output_line_count = 0;
1407    while (<IN>) {
1408    $line = $_;
1409
1410    # delete anything that isn't a printable character
1411    $line =~ s/[^\040-\176]+/\n/sg;
1412
1413    # delete any string less than 10 characters long
1414    $line =~ s/^.{0,9}$/\n/mg;
1415    while ($line =~ m/^.{1,9}$/m) {
1416        $line =~ s/^.{0,9}$/\n/mg;
1417        $line =~ s/\n+/\n/sg;
1418    }
1419
1420    # remove extraneous whitespace
1421    $line =~ s/\n+/\n/gs;
1422    $line =~ s/^\n//gs;
1423
1424    # output whatever is left
1425    if ($line =~ m/[^\n ]/) {
1426        print OUT $line;
1427        ++$output_line_count;
1428    }
1429    }
1430
1431    close OUT;
1432    close IN;
1433
1434    if ($output_line_count) { # try to protect against binary only formats
1435    return 1;
1436    }
1437
1438    &util::rm("$output_filestem.text");
1439    return 0;
1440
1441}
Note: See TracBrowser for help on using the browser.