root/gsdl/trunk/bin/script/gsConvert.pl @ 18282

Revision 18282, 43.9 KB (checked in by ak19, 10 years ago)

Spaces in filenames are replaced with underscores just to be on the safe side. Tested that files with spaces in their names still work when using the Remote GS server and also work in the local case (such as mp3 and wmv files).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs.  The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
46BEGIN {
47    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded (presumably on any non-Windows
# platform) the eval yields undef and the flag falls back to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to
# parsargv::parse.
my ($use_strings, $windows_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
67
# Write the command-line help text to STDERR and terminate the script
# with exit status 1.
sub print_usage
{
    # The help text, in output order.
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        "              or text using third-party programs.\n\n",
        "  usage: $0 [options] filename\n",
        "  options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $chunk (@usage_text) {
        print STDERR $chunk;
    }
    exit(1);
}
89
# File to which converter error output is appended (-errlog option);
# the empty string means no such log is written.
my $faillogfile="";
# CPU-seconds limit for external converters (-timeout option); when
# non-zero the conversion commands are prefixed with "ulimit -t".
my $timeout=0;

# Script entry point.
#
# Parses the command line, works out the input type (from -type or,
# failing that, from the filename extension), changes into the input
# file's directory, runs the matching convertXXX routine and prints its
# result ("html", "text", "item" or "fail") followed by a newline on
# STDOUT.  Exits with status 1 on bad usage, an unreadable input file or
# an unknown/undeterminable input type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    # (the code below expects only the input filename to be left in
    # @args afterwards -- presumably parse() strips the options it
    # recognises; verify against parsargv)
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimage).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting',\$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one positional argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit type: fall back to the lower-cased extension without its
    # leading dot.  With no extension either, the type stays undefined so
    # that the dedicated error below is actually reported.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Converters sharing the (input, stem, output type) signature;
    # convertPDF additionally takes the directory and is handled apart.
    my %converter_for = (
        'doc' => \&convertDOC,
        'dot' => \&convertDOC,
        'rtf' => \&convertRTF,
        'ps'  => \&convertPS,
        'ppt' => \&convertPPT,
        'xls' => \&convertXLS,
    );

    my $result;
    if ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif (my $converter = $converter_for{$input_type}) {
        $result = $converter->($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

main(@ARGV);
182
183
184
185# Document-type conversion functions
186#
187# The following functions attempt to convert documents from their
188# input type to the specified output type.  If no output type was
189# given, then they first attempt HTML, and then TEXT.
190#
# Each returns the output type ("html" or "text", or "item" when paged
# images were produced) or "fail" if no conversion is possible.
193
# Convert a Microsoft word document
#
# Sniffs the real content type of the file first and dispatches to the
# Word 6/7/8 converter, the RTF converter or the generic converter.
# Returns whatever the chosen converter returns.

sub convertDOC {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    } elsif ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    } else {
        return &convertAnything($input_filename, $output_filestem, $output_type);
    }
}
210
# Convert a Microsoft word 6/7/8 document
#
# When HTML output is acceptable (no output type given, or one matching
# /html/i) a specialised converter is tried: the Windows scripting one if
# -windows_scripting was given, wvWare otherwise.  Returns "html" if that
# succeeds; otherwise returns the result of convertAnything.

sub convertWord678 {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;
    if (!$output_type || ($output_type =~ m/html/i)){
        if ($windows_scripting) {
            $success = &native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $success = &doc_to_html($input_filename, $output_filestem);
        }
        if ($success) {
            return "html";
        }
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
230
231
# Convert a Rich Text Format (RTF) file
#
# Only HTML output is attempted (when no output type was given, or one
# matching /html/i).  Returns "html" on success and "fail" otherwise;
# there is deliberately no fall-back to text extraction.

sub convertRTF {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {

        if ($windows_scripting) {
            $success = &native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $success = &rtf_to_html($input_filename, $output_filestem);
        }
        if ($success) {
            return "html";
        }
    }

# rtf is so ugly that it's not worth running strings over.
# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
#    return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
258
259
# Convert an unidentified file
#
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type allows it (an empty/undefined
# output type allows both).  Returns "html", "text" or "fail".

sub convertAnything {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        $success = &any_to_html($input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &any_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
284
285
286
# Convert an Adobe PDF document
#
# Tries, in this order and only where the requested output type permits:
# paged images, HTML, plain text.  Returns "item", "html" or "text" for
# the first conversion that succeeds, and "fail" if none does.

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the output type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    return "item"
        if ($output_type =~ m/jp?g|gif|png/i)
            && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);

    # Attempt conversion to HTML
    return "html"
        if (!$output_type || ($output_type =~ m/html/i))
            && &pdf_to_html($dirname, $input_filename, $output_filestem);

    # Attempt conversion to TEXT
    return "text"
        if (!$output_type || ($output_type =~ m/text/i))
            && &pdf_to_text($dirname, $input_filename, $output_filestem);

    return "fail";

}
321
322
# Convert an Adobe PostScript document
#
# Tries paged-image conversion when the output type names an image format
# (jpg/jpeg, gif, png), then text conversion when the output type allows
# it.  Returns "item", "text" or "fail".

sub convertPS {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;
    # If the output type contains a '-', keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;
    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        # This routine is not handed a directory, so derive it from the
        # input filename with the same fileparse() call main() uses for
        # the directory it passes to convertPDF.
        my (undef, $dirname)
            = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success){
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
347
348
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run to produce images (gif/jpg/png
# selected from the output type) and "item" is returned on success, or
# straight away if the output directory already exists.  Otherwise, when
# HTML output is acceptable, ppttohtml.pl is run and "html" returned on
# success.  If neither path succeeded the generic text extraction is
# tried ("text"), and finally "fail" is returned.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;
    my $success = 0;

    my $ppt_convert_type = "";
    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i){
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i){
            $ppt_convert_type = "-p";
        }
        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # the command string is local to this routine, not a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        } else {
            $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
            $cmd .= " 2>\"$output_filestem.err\""
                if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
            if (system($cmd) !=0) {
                print STDERR "Powerpoint VB Scripting convert failed\n";
            } else {
                return "item";
            }
        }
    } elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        #if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $!=0;
        if (system($cmd)!=0)
        {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: generic text extraction
    $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
410
411
# Convert a Microsoft Excel document
#
# When HTML output is acceptable, xlstohtml.pl is run and "html" is
# returned if the command exits successfully.  Otherwise the generic text
# extraction is tried ("text"), and "fail" is returned if that fails too.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        # (kept local to this routine rather than in a package global)
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $!=0;
        if (system($cmd)!=0)
        {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: generic text extraction
    $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
444
445
446
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files.  This function attempts to tell the difference.
#
# Returns "rtf" if the first line starts with "{\rtf", "word6", "word7"
# or "word8" if any line contains "Word.Document.<6|7|8>", and "unknown"
# otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # three-argument open: the filename is never interpreted as a mode
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }

    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
482
483
484# Specific type-to-type conversions
485#
486# Each of the following functions attempts to convert a document from
487# a specific format to another.  If they succeed they return 1 and leave
488# the output document(s) in the appropriate place; if they fail they
489# return 0 and delete any working files.
490
491
# Attempt to convert a word document to html with the wv program
#
# Arguments: the input filename and the output file stem; the HTML is
# written to "<stem>.html" and converter errors to "<stem>.err".
# Returns 1 if the HTML file is non-empty and its first line contains
# "DOCTYPE HTML", 0 otherwise.  On that later failure path the .html and
# .err files are removed (the .err content is first appended to the
# -errlog file, if one was given).  When the converter command itself
# exits non-zero, its error output is echoed and 0 is returned without
# removing those files.
sub doc_to_html {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extention_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message)
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a seperate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
    #print "assoc_dir: ".$assoc_dir."\n";  # same as "$output_filestem._files"

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix handed to wvWare's --basename for the image files it creates
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    #print STDERR "****toppath: $toppath\n****docname: $docname\n;
    #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
    #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    # Replaced the next line with the *2 lines* following it:
    # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log=0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile))
            {$write_to_fail_log=1;}

            # Echo the converter's error output, skipping blank lines
            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);

                } # while errfile
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Inserted this code to remove the images directory if it was still empty after
            # the html was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                #print STDERR "***gsConvert.pl: Image dir $assoc_dir is empty, removing***\n";
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
642
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Returns 1 once the file has been rewritten, 0 if it could not be opened
# for reading or for writing.
sub make_links_to_assocdir_relative{
    # toppath is the top-level folder in which the html file we're going to be fixing resides
    # docname is just the name (without extension) of the html file
    # html_file is the full path to the html file: /full/path/docname.html
    # assoc_dir_path is toppath/docname_files
    # assoc_dirname is the directory name of the folder with associated imgs: docname_files
    # (toppath and docname are accepted but not used below)
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    # open the original file for reading
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the <...> operator reads the entire file at once
    # ("slurping"); local restores the normal line separator afterwards.
    my $html_contents;
    {
        local $/ = undef;        # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin); # close the file
    #print STDERR $html_contents;

    # 2. Replace (substitute) *all* occurrences of the assoc_dir_path in a hrefs and img src
    # values with assoc_dirname

    # wvWare put %20 in place of space, so we need to change our prefix to match
    my $url_prefix = $assoc_dir_path;
    $url_prefix =~ s/ /%20/g;

    # The prefix must be matched literally.  \Q...\E (quotemeta) escapes
    # every regex metacharacter -- not only \ . - [ ] but also ( ) + $ etc.
    # An unescaped parenthesis in the path would otherwise add a capture
    # group and shift the $5 used in the replacement.

    # The following regular expression substitution looks for <a or <img, followed by any other
    # attributes and values until it comes to the FIRST (indicated by ?) href= or src=
    # followed by " or ' or no quotes at all around path, followed by the associated folder's pathname
    # followed by characters (for the img filename), then finally the optional closing quotes
    # in " or ' form, followed by any other attributes and values until the first > to end the tag.
    # The substitution: all the parts preceding associated folder's pathname are retained,
    # the associated folder path name is replaced by associated folder directory name
    # and the rest upto and including the closing > tag is retained.
    # The s modifier lets '.' match newlines as well, so a tag may span several lines;
    # g performs a global replace, so that all occurrences are substituted.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)\Q$url_prefix\E(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;
    #$html_contents =~ s/\Q$url_prefix\E/$assoc_dirname/gs; # this works, used as fall-back

    # now replace any %20 chars in filenames of href or src attributes. Calls a function for this
    # NOTE(review): because (.*) is greedy and /s lets '.' cross newlines, the
    # first match appears to run up to the last '>' in the file, so the
    # post-processing is applied to much more than one attribute value --
    # confirm whether that is intended before tightening this pattern.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;

    #print STDERR "****assoc_dirname: $assoc_dirname***\n";
    #print STDERR "****url_prefix: $url_prefix***\n";

    # delete the original file and recreate it
    my $copy_of_filename = $html_file;
    &util::rm($copy_of_filename); # deleted the file

    # Recreate the original file for writing the updated contents
    my $fout;
    unless (open($fout, '>', $html_file)) {  # open it as a new file for writing
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print $fout $html_contents;
    close($fout);
    return 1;
}
728
# Helper for rewriting link text in the generated HTML.
# Takes three strings (prefix, link text, suffix) and returns them joined
# together after cleaning up the link text, in this order:
#   1. every "%20" becomes an underscore (spaces are no longer kept as
#      %20; they are replaced with underscores),
#   2. every windows-style backslash becomes a url slash (/),
#   3. every remaining percent sign becomes "%25".
# The prefix and suffix are passed through untouched.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {       # alias $_ to $text for the three rewrites
        s{%20}{_}g;
        tr{\\}{/};
        s{%}{%25}g;
    }

    return join('', $pre, $text, $post);
}
743
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: the input filename and the output file stem; the HTML is
# written to "<stem>.html" and converter errors to "<stem>.err".
# Returns 1 if "<stem>.html" already exists, or if after conversion it is
# non-empty and its first line contains "html".  Returns 0 otherwise; on
# that later failure path the .html and .err files are removed (the .err
# content is first appended to the -errlog file, if one was given).  When
# the converter command itself exits non-zero, its error output is echoed
# and 0 is returned without removing those files.
sub native_doc_to_html {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
    #$cmd .=  "$vbScript $input_filename $output_filestem.html";
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log=0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile))
            {$write_to_fail_log=1;}

            # Echo the converter's error output, skipping blank lines
            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);

                } # while errfile
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
820
# Attempt to convert an RTF document to HTML with the external "rtftohtml"
# program.
#
# Arguments:
#   $input_filename  - path of the RTF file to convert
#   $output_filestem - output path without extension; the converter is told
#                      to write "$output_filestem.html"
#
# Returns 1 if the generated HTML contains some text after its <body> line,
# 0 otherwise.  On success a "${output_filestem}_ToC.html" file, if present
# (presumably written by rtftohtml), is merged into the HTML and removed.
# On failure the captured stderr is appended to $faillogfile (when set) and
# the .html and .err files are removed.
#
# NOTE: file names are embedded in double quotes on a shell command line,
# so names containing '"', '$' or '`' are not handled safely.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";
    my $src_file  = "$output_filestem.src";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    # stderr is only captured on non-Windows systems or WinNT/2000
    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?  We need some content other than the
    # header, i.e. a word character on some line after the <body> line.
    my $was_successful = 0;
    if (-s $html_file) {
        if (open(my $html_in, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {  # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close $html_in;
        }
        else {
            print STDERR "rtf_to_html: couldn't read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # "or"-style checking: with "open FH, $name || ..." the "||"
            # binds to the file name, so failures would go unnoticed.
            my ($src_in, $toc_in, $html_out);
            my $src_ok  = open($src_in, '<', $src_file);
            my $toc_ok  = open($toc_in, '<', $toc_file);
            # only create the new .html once both inputs are readable
            my $html_ok = ($src_ok && $toc_ok)
                ? open($html_out, '>', $html_file)
                : 0;

            if (!($src_ok && $toc_ok && $html_ok)) {
                # give up on the merge and restore the unmerged HTML
                close $src_in   if ($src_ok);
                close $toc_in   if ($toc_ok);
                close $html_out if ($html_ok);
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $line = <$src_in>)) {
                last if ($line !~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc_in>; } # ignore first 2 lines
            my $third_line = <$toc_in>;               # line 3 = "<ol>\n"
            print {$html_out} $third_line if (defined $third_line);
            while (my $line = <$toc_in>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html_out} $line;
            }
            close $toc_in;

            # rest of html src
            while (my $line = <$src_in>) {
                print {$html_out} $line;
            }
            close $src_in;
            close $html_out;

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Failure: record what the converter said, then clean up.
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
938
939
# Convert a PDF file to HTML by shelling out to pdftohtml.pl.
#
# Arguments:
#   $dirname         - accepted but not used by this routine
#   $input_filename  - the PDF file handed to pdftohtml.pl
#   $output_filestem - output path without extension; success is judged by
#                      a non-empty "$output_filestem.html"
#
# Returns 1 when the command exits with status 0 and the .html file is
# non-empty.  Otherwise the captured error output is echoed to STDERR and
# appended to $faillogfile (when set), the .html/.out/.err files are
# removed, and 0 is returned.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Collect the optional converter switches chosen by the file-level flags.
    my @flags;
    push(@flags, " -c")      if ($pdf_complex);
    push(@flags, " -i")      if ($pdf_ignore_images);
    push(@flags, " -a")      if ($pdf_allow_images_only);
    push(@flags, " -hidden") if (!$pdf_nohidden);

    # NOTE: $cmd is not lexical to this sub.
    $cmd = ($timeout) ? "ulimit -t $timeout;" : "";
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= join("", @flags);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr are captured separately except on old Windows,
    # where only stdout is redirected (into the .err file).
    my $split_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $split_streams
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # The converter worked: tidy up its captured output and report success.
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # The converter failed or made nothing.
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }

    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open(ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }

    return 0;
}
1001
# Convert a PDF (or PostScript) file to images by running pdfpstoimg.pl.
#
# Arguments:
#   $dirname         - accepted but not used by this routine
#   $input_filename  - the file handed to pdfpstoimg.pl
#   $output_filestem - passed to pdfpstoimg.pl as the output location; also
#                      the stem of the .out/.err capture files
#   $output_type     - requested output type; everything up to and including
#                      the last underscore is stripped before it is passed
#                      to pdfpstoimg.pl as -convert_to
#
# Returns 1 if the command exits with status 0, otherwise 0.  Returns 0
# without running anything if ImageMagick's "identify" cannot be started.
# On failure the captured error output is echoed to STDERR and appended to
# $faillogfile (when set).  The .out/.err capture files are always removed.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $identify_output = `identify 2>&1`; # only $? is of interest
        my $status = $?;
        # "program not found" shows up as -1 (nothing could be started),
        # as 256 on Windows, or as exit code 127 from a POSIX shell (the
        # "2>&1" means the command is run through the shell).
        if ($status == -1 || $status == 256 || ($status >> 8) == 127) {
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm($out_file) if (-e $out_file);

        if (-e $err_file) {
            # echo the converter's std err, if any, and copy it to the
            # fail log; an unreadable error log is not worth aborting for
            my $faillog;
            my $have_faillog = ($faillogfile ne ""
                                && open($faillog, '>>', $faillogfile));
            if (open(my $errlog, '<', $err_file)) {
                print STDERR "pdfpstoimg error log:\n" if (-s $err_file);
                while (my $line = <$errlog>) {
                    print STDERR $line;
                    print {$faillog} $line if ($have_faillog);
                }
                close $errlog;
            }
            close $faillog if ($have_faillog);
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1070
# Convert a PDF file to plain text by running the pdftotext command.
#
# Arguments:
#   $dirname         - accepted but not used by this routine
#   $input_filename  - the PDF file handed to pdftotext
#   $output_filestem - output path without extension; text is written to
#                      "$output_filestem.text"
#
# Returns 1 if a non-empty .text file containing at least one line with a
# word character was produced, otherwise 0 (after echoing the captured
# error output to STDERR, appending it to $faillogfile when set, and
# removing the .out/.text/.err files).
# NOTE(review): on success only the .err file is removed here; a .out file,
# if one was created, is left in place.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= ($ENV{'GSDLOS'} !~ m/^windows$/i)
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open(EXTR_TEXT, $text_file) || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...

        # Scan until a line with a word character turns up.  The scan stops
        # at the first false value read, which normally means end of file.
        my $seen_text = 0;
        until ($seen_text) {
            my $line = <EXTR_TEXT>;
            last unless ($line);
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close EXTR_TEXT;

        unless ($seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something usable
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open(ERRLOG, $err_file) || die "$!";
        print STDERR "pdftotext error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }

    # does this converter create a .out file?
    &util::rm($out_file)  if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open(ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }

    return 0;
}
1136
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without extension; text is written to
#                      "$output_filestem.text"
#
# gs is tried first (never on Windows).  If it cannot be used, fails, or
# its output starts with "Error: ", the problem is logged and the text is
# extracted by a crude regular-expression pass over the PostScript source.
# Whichever way the text was obtained, it is then re-wrapped at the first
# whitespace after 72 characters.
#
# Returns 1 on success, 0 if no text file could be produced.
#
# NOTE: file names are embedded in double quotes on a shell command line,
# so names containing '"', '$' or '`' are not handled safely.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted, so that output paths containing spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # Test the whole wait status rather than just the exit code
        # ($? >> 8), so that death by signal also counts as a failure.
        my $status = system($cmd);

        if ($status != 0) {
            $error = ($!) ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            close $psout;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
        else {
            $error = "couldn't read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        $text = _ps_strip_text($text);

        # The output file is only created once there is something to put
        # in it, so the early returns above leave no empty .text behind.
        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out} $text;
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);

    # Failures are reported and turned into a 0 return, like the other
    # failure paths of this routine.
    my ($infile, $outfile);
    if (!open($infile, '<', $tmp_file)) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    if (!open($outfile, '>', $text_file)) {
        print STDERR "Couldn't open file for writing: $!\n";
        close $infile;
        &util::rm($tmp_file) if (-e $tmp_file);
        return 0;
    }

    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # If the substitution should ever fail to match, the rest of
            # the line is written out as it is rather than looping forever.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            }
            else {
                print {$outfile} $line;
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}

# Crude extraction of the text held in the "(...)" string operands of a
# PostScript program.  Takes the whole file as one string and returns the
# extracted text.  The substitutions are order dependent.
sub _ps_strip_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # | -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    # NOTE(review): the next two replacements emit the raw characters
    # octal 023 and octal 252 - confirm these are the intended codes.
    $text =~ s/\\023/\023/g;  # e acute ('e)
    $text =~ s/\\177/\252/g;  # u"
#   $text =~ s/ ?? /\344/g;  # a"

    return $text;
}
1306
1307
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - the file to extract strings from
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first produced by any_to_text() and then wrapped in a minimal
# HTML page: '&', '<' and '>' are escaped, whitespace-only lines become
# "<p>" and every other line is prefixed with "<br> ".  The intermediate
# .text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text() fails or a file cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_in, $html_out);
    if (!open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_in;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_out} "<html><head>\n";
    print {$html_out} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_out} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_out} "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # '&' must be escaped first, or the entities added for '<' and '>'
        # would themselves be escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_out} "<p>";
        }
        else {
            print {$html_out} "<br> ", $line;
        }
    }
    print {$html_out} "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1344
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :(     (jrm21)
#
# Arguments:
#   $input_filename  - the file to extract strings from (read in binary mode)
#   $output_filestem - output path without extension; text is written to
#                      "$output_filestem.text"
#
# Does nothing and returns 0 unless $use_strings is set.  Otherwise returns
# 1 if at least one chunk of text was written, and 0 (after removing the
# .text file) if nothing usable was found or a file could not be opened.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in;  # don't leave the input handle open on failure
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;
}
Note: See TracBrowser for help on using the browser.