root/main/trunk/greenstone2/bin/script/gsConvert.pl @ 22596

Revision 22596, 48.8 KB (checked in by kjdon, 9 years ago)

changed output options from pagedimage to pagedimg so they match convert_to options of plugins

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Greenstone's own Perl modules are loaded from $GSDLHOME/perllib, so that
# directory is pushed onto the front of @INC at compile time, before the
# 'use' statements below are processed.  Without GSDLHOME nothing can be
# located, hence the immediate die.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    defined $gsdl_home or die "GSDLHOME not set\n";
    unshift @INC, "$gsdl_home/perllib";
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56use File::Basename;
57
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# normalised to 0 on the following line.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
my ($windows_scripting, $openoffice_scripting);
70
# Print the command-line synopsis on STDERR and terminate the script with
# exit status 1 (this routine never returns).
#
# The list of values shown for -type is wider when -openoffice_scripting
# has been requested, mirroring the option specification built in main().
sub print_usage
{
    my $input_types = $openoffice_scripting
        ? "doc|dot|docx|odf|pdf|ps|ppt|rtf|xls"
        : "doc|dot|pdf|ps|ppt|rtf|xls";

    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR "              or text using third-party programs.\n\n";
    print STDERR "  usage: $0 [options] filename\n";
    print STDERR "  options:\n\t-type\t$input_types\t(input file type)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    # -verbose is accepted by main() and was missing from this summary
    print STDERR "\t-verbose\t<number>\t(verbosity level)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
    print STDERR "\t-openoffice_scripting\tuse OpenOffice (if available) to convert Microsoft Office documents \n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
98
# Values of -errlog and -timeout: the file that converter error output is
# appended to ("" = none) and the CPU limit in seconds (0 = no limit).
my ($faillogfile, $timeout) = ("", 0);
101
# Script entry point.
#
# Parses the command line (passed in as a list), works out the input type
# from -type or, failing that, from the filename extension, and hands the
# file to the matching convertXXX routine.  The string returned by that
# routine is printed on STDOUT followed by a newline.
#
# Usage errors go through print_usage(), which exits.  An unreadable input
# file, a missing input type or an unsupported input type is reported on
# STDERR and ends the script with exit status 1.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # scan for -openoffice_scripting up front, as it affects the
    # permissible values for -type
    foreach my $arg (@args) {
        if ($arg =~ m/^-openoffice_scripting$/) {
            $openoffice_scripting = 1;
            last;
        }
    }

    my $parse_type = $openoffice_scripting
        ? 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/'
        : 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         $parse_type, \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'openoffice_scripting',\$openoffice_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one argument -- the input file -- must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir() into the file's directory further down, after which a
    # relative name such as "sub/file.doc" would no longer resolve.  Make
    # relative names absolute first; absolute names are left untouched.
    require File::Spec;
    if (!File::Spec->file_name_is_absolute($input_filename)) {
        $input_filename = File::Spec->rel2abs($input_filename);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back on the filename extension (minus its dot)
    if (!defined($input_type) || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # "auto" means "no preference".  The convertXXX routines express that
    # as an empty output type (they test !$output_type), so normalise it
    # here; otherwise "auto" would match neither their html nor their text
    # tests.
    if (!defined($output_type) || $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to whoever invoked the script
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
213
# Run the conversion on the script's command-line arguments
main(@ARGV);
215
216
217
218# Document-type conversion functions
219#
220# The following functions attempt to convert documents from their
221# input type to the specified output type.  If no output type was
222# given, then they first attempt HTML, and then TEXT.
223#
# Each returns the output type ("html" or "text", or "item" as returned by
# the PDF/PostScript image conversion and the PowerPoint scripting
# conversion) or "fail" if no conversion is possible.
226
# Convert a Microsoft word document
#
# Returns the result string of whichever converter handled the file.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # With OpenOffice enabled, files named .doc/.docx go straight to it and
    # its verdict is final: no other converter is tried afterwards.
    if (($openoffice_scripting) && ($input_filename =~ m/\.docx?$/i)) {
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    if (grep { $detected eq $_ } ("word6", "word7", "word8")) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
# Convert a Microsoft word 6/7/8 document
#
# When HTML is acceptable, exactly one HTML converter is tried, chosen by
# the scripting switches.  If that is skipped or fails, the generic
# converters in convertAnything take over.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $to_html = $windows_scripting    ? \&native_doc_to_html
                    : $openoffice_scripting ? \&openoffice_doc_to_html
                    :                         \&doc_to_html;
        my $ok = $to_html->($input_filename, $output_filestem);
        return "html" if $ok;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
276
277
# Convert a Rich Text Format (RTF) file
#
# Only a specialised HTML conversion is attempted; there is deliberately
# no text fallback (see the note at the end of the routine).
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $to_html = $windows_scripting    ? \&native_doc_to_html
                    : $openoffice_scripting ? \&openoffice_doc_to_html
                    :                         \&rtf_to_html;
        my $ok = $to_html->($input_filename, $output_filestem);
        return "html" if $ok;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    #    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
307
308
# Convert an unidentified file
#
# Tries the generic HTML converter and then the generic text converter,
# skipping whichever one the requested output type rules out.
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # each attempt: [pattern the requested type must match, converter, result]
    my @attempts = (
        [ qr/html/i, \&any_to_html, "html" ],
        [ qr/text/i, \&any_to_text, "text" ],
    );

    foreach my $attempt (@attempts) {
        my ($wanted, $converter, $result) = @$attempt;

        # an empty output type means "no preference": try everything
        next unless (!$output_type || ($output_type =~ $wanted));

        my $ok = $converter->($input_filename, $output_filestem);
        return $result if $ok;
    }

    return "fail";
}
333
334
335
# Convert an Adobe PDF document
#
# Order of attempts: images (only when an image format is requested), then
# HTML, then text.  Returns "item", "html", "text" or "fail" accordingly.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # if the requested type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $ok = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if $ok;
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $ok = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if $ok;
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $ok = &pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if $ok;
    }

    return "fail";
}
370
371
# Convert an Adobe PostScript document
#
# Order of attempts: images (only when an image format is requested), then
# text.  There is no HTML conversion for PostScript.
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # if the requested type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $ok = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if $ok;
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $ok = &ps_to_text($input_filename, $output_filestem);
        return "text" if $ok;
    }

    return "fail";
}
396
397
# Convert a Microsoft PowerPoint document
#
# Strategy, in order:
#   - with -openoffice_scripting: OpenOffice only ("html" or "fail");
#   - with -windows_scripting and an output type that is neither html nor
#     text: the pptextract script, which yields "item";
#   - otherwise, when HTML is acceptable: ppttohtml.pl;
#   - finally, whatever was tried before, the generic text extraction.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($openoffice_scripting) {
        # Jump right in and process with Open Office
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # map the requested image format onto pptextract's switch
        my $ppt_convert_type = ($output_type =~ m/gif/i)  ? "-g"
                             : ($output_type =~ m/jp?g/i) ? "-j"
                             : ($output_type =~ m/png/i)  ? "-p"
                             :                              "";

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "perl -S ppttohtml.pl " . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    my $ok = &any_to_text($input_filename, $output_filestem);
    return "text" if $ok;

    return "fail";
}
470
471
# Convert a Microsoft Excel document
#
# With -openoffice_scripting only OpenOffice is used ("html" or "fail").
# Otherwise xlstohtml.pl is tried when HTML is acceptable, and the generic
# text extraction is the fallback in every other case.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($openoffice_scripting) {
        # Jump right in and process with Open Office
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl " . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    my $ok = &any_to_text($input_filename, $output_filestem);
    return "text" if $ok;

    return "fail";
}
514
515
516
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns "rtf" if the first line starts with "{\rtf", "word6", "word7" or
# "word8" if a "Word.Document.N" marker is found anywhere in the file, and
# "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # An unreadable file cannot be identified
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    # A lexical line variable is used so the caller's $_ is left alone
    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
552
553
554# Specific type-to-type conversions
555#
556# Each of the following functions attempts to convert a document from
557# a specific format to another.  If they succeed they return 1 and leave
558# the output document(s) in the appropriate place; if they fail they
559# return 0 and delete any working files.
560
561
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare on $input_filename, writing "$output_filestem.html" and
# placing any extracted images in a "<docname>_files" folder next to the
# input document.  Returns 1 if a non-empty file whose first line contains
# "DOCTYPE HTML" was produced, 0 otherwise.  Converter error output is
# copied to the -errlog file when one was given.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # use the self-contained wv installation under bin/linux/wv if there is one
    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
        my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the previous value when there is one: a trailing ':'
        # would add an empty entry, which the dynamic loader treats as the
        # current directory.
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        } else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message)
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a seperate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # the images are all going to be called <docname>0, <docname>1, ...
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            # the fail log is optional; $faillog stays undef without one
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef($faillog);
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($faillog);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($faillog);
                } # while errfile
                close($errfile);
            }
            close($faillog) if ($faillog);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the
            # html was generated (i.e. there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            # append the converter's error output to the fail log
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
717
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Returns 1 once the html file has been rewritten, 0 if it could not be
# opened for reading or for writing.
sub make_links_to_assocdir_relative{
    # toppath is the top-level folder in which the html file we're going to be fixing resides
    # docname is just the name (without extension) of the html file
    # html_file is the full path to the html file: /full/path/docname.html
    # assoc_dir_path is toppath/docname_files
    # assoc_dirname is the directory name of the folder with associated imgs: docname_files
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # with $/ undefined, <...> returns the whole file as one 'line'
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);

    # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a hrefs and img src
    # values with assoc_dirname

    # wvWare puts %20 in place of space, so change our prefix to match, and
    # then escape *every* regular expression metacharacter in it.  (Escaping
    # only a hand-picked few lets names such as "report (1)" introduce an
    # extra capture group and corrupt the substitution.)
    (my $encoded_assoc_path = $assoc_dir_path) =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($encoded_assoc_path);

    # The following substitution looks for <a or <img, followed by any other
    # attributes and values until it comes to the FIRST (indicated by ?) href= or src=
    # followed by " or ' or no quotes at all around the path, followed by the associated
    # folder's pathname, followed by characters (for the img filename), then finally the
    # optional closing quote, followed by any other attributes and values until the
    # first > to end the tag.
    # The substitution: all the parts preceding the associated folder's pathname are
    # retained, the associated folder path name is replaced by the associated folder
    # directory name and the rest up to and including the closing > is retained.
    # s lets . match newlines so a tag may span lines; g replaces every occurrence.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;

    # Now post-process the href/src values of <a> and <img> tags (%20 back to
    # a literal space, backslashes to url slashes, remaining % to %25).
    # Only the attribute value itself is handed over: a double-quoted,
    # single-quoted or bare value.  A greedy match here would run from the
    # first link to the last '>' of the document and rewrite ordinary body
    # text as well.
    $html_contents =~ s{(<(?:a|img)\b[^>]*?(?:href|src)=)("[^"]*"|'[^']*'|[^\s>]*)}
                       {&post_process_assocfile_urls($1, $2, "")}sge;

    # delete the original file and recreate it
    my $copy_of_filename = $html_file;
    &util::rm($copy_of_filename); # deleted the file

    # Recreate the original file for writing the updated contents
    my $fout;
    unless (open($fout, '>', $html_file)) {  # open it as a new file for writing
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print {$fout} $html_contents;
    close($fout);
    return 1;
}
803
# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
# introduced in link pathnames by wvWare into space again. Any percent sign left after
# that is written as %25.
#
# $pre and $post are passed through untouched; only $text is rewritten.
# Returns the three pieces joined back together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;     # spaces, not underscores: underscores mess with incremental rebuild
        tr{\\}{/};     # windows path separators become url slashes
        s/%/%25/g;     # must run after the %20 step above
    }

    return join("", $pre, $text, $post);
}
819
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments:
#   $input_filename  - the document to convert
#   $output_filestem - output path without extension; word2html is asked to
#                      write "$output_filestem.html", and its STDERR is sent
#                      to "$output_filestem.err" where redirection is used
#
# Returns 1 if "$output_filestem.html" already exists, or if the converter
# ran and left a non-empty file whose first line contains "html".
# Returns 0 otherwise; converter error output is appended to $faillogfile
# when that is set.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows only the bare program name is used
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e $html_file) {
        print STDERR "    The conversion file:\n";
        print STDERR "      $html_file\n";
        print STDERR "    ... already exists.  Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s $err_file) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # echo the converter's error output to STDERR (and the fail log)
            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # close explicitly so the .err file is not held open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
898
# Attempt to convert a word document to html with JODConvert scripting program
#
# Arguments:
#   $input_filename  - the document to convert
#   $output_filestem - output path without extension; oo2html is asked to
#                      write "$output_filestem.html", and its STDERR is sent
#                      to "$output_filestem.err" where redirection is used
#
# The oo2html script is looked up under $ENV{'GEXT_OPENOFFICE'}/bin/script.
#
# Returns 1 if "$output_filestem.html" already exists, or if the converter
# ran and left a non-empty file whose first line contains "html".
# Returns 0 if oo2html cannot be found or the conversion failed; converter
# error output is appended to $faillogfile when that is set.
sub openoffice_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    if (-e $html_file) {
        print STDERR "    The conversion file:\n";
        print STDERR "      $html_file\n";
        print STDERR "    ... skipping\n";
        return 1;
    }

    my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script");
    my $oo2html = &util::filename_cat($oo_script_dir, "oo2html");
    if (!-e $oo2html) {
        print STDERR "Error: Unable to find 'oo2html' in: \n";
        print STDERR "       $oo_script_dir\n";
        print STDERR "       Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$oo2html \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing oo2html converter: $!\n";
        print STDERR "Command was: $cmd\n";

        if (-s $err_file) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # echo the converter's error output to STDERR (and the fail log)
            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # close explicitly so the .err file is not held open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
983
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments:
#   $input_filename  - the RTF document to convert
#   $output_filestem - output path without extension; rtftohtml is asked to
#                      write "$output_filestem.html"
#
# The conversion counts as successful when the generated html contains at
# least one word character (outside tags) after the line holding "<body>".
# If rtftohtml also produced "${output_filestem}_ToC.html", that table of
# contents is spliced into the html after the header lines, with its links
# reduced to bare "#fragment" references.
#
# Returns 1 on success. Returns 0 otherwise, after appending the converter's
# error output to $faillogfile (when set) and removing the .err/.html files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header;
        # an unreadable file is simply treated as having no content
        if (open(my $html, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html>) {
                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header = 1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html);
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # "or"-style checks: with "open FH, $name || ..." the || binds
            # to the filename string and the failure branch can never run
            my $src_ok = open(my $htmlsrc, '<', $src_file);
            my $toc_ok = open(my $toc, '<', $toc_file);
            my $out_ok = open(my $html, '>', $html_file);

            unless ($src_ok && $toc_ok && $out_ok) {
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                close($html) if ($out_ok);
                # fall back to the html without a table of contents
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;   # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>;   # line 3 = "<ol>\n"
            print {$html} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close($htmlsrc);
            close($html);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
1100
1101
# Convert a pdf file to html with the pdftohtml command
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - the pdf to convert
#   $output_filestem - output path without extension, handed to pdftohtml.pl
#
# The pdftohtml.pl options are taken from the globals $pdf_zoom,
# $pdf_complex, $pdf_ignore_images, $pdf_allow_images_only and $pdf_nohidden.
#
# Returns 1 when the command exits with status 0 and "$output_filestem.html"
# is non-empty; otherwise reports the error log, appends it to $faillogfile
# (when set), removes the partial output and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_path = "$output_filestem.html";
    my $out_path  = "$output_filestem.out";
    my $err_path  = "$output_filestem.err";

    # assemble the command line piece by piece
    my @pieces = ("perl -S pdftohtml.pl -zoom $pdf_zoom");
    push(@pieces, "-c") if ($pdf_complex);
    push(@pieces, "-i") if ($pdf_ignore_images);
    push(@pieces, "-a") if ($pdf_allow_images_only);
    push(@pieces, "-hidden") unless ($pdf_nohidden);
    push(@pieces, "\"$input_filename\" \"$output_filestem\"");

    my $cmd = ($timeout ? "ulimit -t $timeout;" : "") . join(" ", @pieces);

    # STDERR gets its own file only where it can be redirected
    my $separate_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $separate_stderr
        ? " > \"$out_path\" 2> \"$err_path\""
        : " > \"$err_path\"";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && (-s $html_path)) {
        &util::rm($err_path) if (-e $err_path);
        &util::rm($out_path) if (-e $out_path);
        return 1;
    }

    # failure from here on
    &util::rm($out_path) if (-e $out_path);

    # print out the converter's std err, if any
    if (-s $err_path) {
        open(ERRLOG, $err_path) or die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close(ERRLOG);
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_path) if (-e $html_path);

    if (-e $err_path) {
        # keep a copy of the error output in the fail log before deleting it
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open(ERRLOG, $err_path);
            print FAILLOG $_ while (<ERRLOG>);
            close(ERRLOG);
            close(FAILLOG);
        }
        &util::rm($err_path);
    }
    return 0;
}
1164
# Convert a pdf file to various types of image with the convert command
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - the pdf/ps file to convert
#   $output_filestem - output path without extension, handed to pdfpstoimg.pl
#   $output_type     - requested output type; everything up to and including
#                      the last underscore is stripped before it is passed
#                      to pdfpstoimg.pl as -convert_to
#
# Returns 0 if ImageMagick's identify cannot be run or if pdfpstoimg.pl exits
# with a non-zero status (its error log is printed and appended to
# $faillogfile when set); returns 1 otherwise.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $result = `identify 2>&1`;
        if ($? == -1 || $? == 256) {  # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm($out_file) if (-e $out_file);
        #print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }
    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1233
# Convert a PDF file to text with the pdftotext command
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - the pdf to convert
#   $output_filestem - output path without extension; pdftotext is asked to
#                      write "$output_filestem.text"
#
# Returns 1 when "$output_filestem.text" exists, is non-empty and contains at
# least one word character. Otherwise prints pdftotext's error log, appends
# it to $faillogfile (when set), removes the .out/.text/.err files and
# returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        } else {
            # an unreadable file is handled like one without any text
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        # the stdout redirect above creates a .out file on non-windows
        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }
    &util::rm($err_file) if (-e $err_file);
    # also clear the redirected stdout so no stray .out file is left behind
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1299
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without extension; the text ends up in
#                      "$output_filestem.text"
#
# On non-windows systems gs is run with ps2ascii.ps. If that is skipped
# (windows) or fails, the failure is logged and a crude regexp-based
# extraction of the strings in the PostScript source is used instead.
# Either way the resulting text is re-wrapped at the first whitespace after
# 72 characters.
#
# Returns 1 on success. Returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with "%!".
# Dies if the wrapping step cannot open its temporary or output file.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path, so filestems with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8;  # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            $error = $! ? $! : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($psout);
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in;
        unless (open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file... (checked before the output file is
        # created, so a bad input leaves no empty .text file or open handle)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            return 0;
        }

        my $out;
        unless (open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
#       $text =~ s/ ?? /\344/g;  # a"

        print {$out} "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or", not "||": with "open FH, $name || die" the || binds to the
    # filename string and the die can never be reached
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # only consult $1 when the substitution really matched
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1469
1470
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first produced by any_to_text, then wrapped in a minimal html
# page: blank lines become <p>, other lines are prefixed with <br>, and the
# characters & < > are escaped. The intermediate .text file is removed.
#
# Returns 1 on success; 0 if any_to_text fails or if either the text file or
# the html file cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text, $html);
    unless (open($text, '<', $text_file)) {
        print STDERR "gsConvert.pl: Unable to open $text_file for reading: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    unless (open($html, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing: $!\n";
        close($text);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html} "<html><head>\n";
    print {$html} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html} "</head><body>\n\n";

    while (my $line = <$text>) {
        # ampersands first, so the entities added below are not re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html} "<p>";
        } else {
            print {$html} "<br> ", $line;
        }
    }
    print {$html} "\n</body></html>\n";

    close($html);
    close($text);

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1507
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments:
#   $input_filename  - the file to pull strings out of (read in binary mode)
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Does nothing and returns 0 unless the global $use_strings is set.
# Returns 1 if at least one chunk of text was written; otherwise the (empty)
# .text file is removed and 0 is returned. Also returns 0 if either file
# cannot be opened.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless ($use_strings);

    print STDERR "\n**** In any to text****\n\n";
    open(IN, "<$input_filename") or return 0;
    binmode(IN);
    open(OUT, ">$output_filestem.text") or return 0;

    my $chunks_written = 0;
    while (<IN>) {
        my $chunk = $_;

        # every run of non-printable characters collapses to a line break
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # throw away strings shorter than 10 characters, repeating
        # until none are left
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # squeeze repeated line breaks and drop a leading one
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # skip chunks with nothing but line breaks and spaces left
        next unless ($chunk =~ m/[^\n ]/);

        print OUT $chunk;
        ++$chunks_written;
    }

    close(OUT);
    close(IN);

    # try to protect against binary only formats
    return 1 if ($chunks_written);

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the browser.