source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24169

Last change on this file since 24169 was 24169, checked in by ak19, 13 years ago

Wait message added to gsConvert.pl, since docx to html can take long (as long as Word takes to convert the file). Added useful links to SaveAs formats to docx2html file.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 46.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time bootstrap: make the Greenstone Perl library loadable and
# record where the running perl interpreter lives.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");

    # File::Basename::fileparse() is called by its fully-qualified name
    # further down this script, so load the module unconditionally rather
    # than only when PERLPATH happens to be unset.
    require File::Basename;

    # PERLPATH is used later to build the path of the perl executable that
    # runs the helper scripts; default it to the directory holding the
    # interpreter that is running this script.
    if (!$ENV{'PERLPATH'}) {
        $ENV{'PERLPATH'} = File::Basename::dirname($^X);
    }
}
58
59use strict;
60
61use parsargv;
62use util;
63use Cwd;
64
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module cannot be loaded elsewhere, in which case the eval
# yields undef and the flag is forced to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches shared with the conversion routines; main() fills
# them in when it parses the arguments.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
76
# Print the command-line synopsis to STDERR and terminate the program with
# exit status 1. This routine never returns to its caller.
sub print_usage
{
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this line
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_text;
    exit(1);
}
98
# Option values shared with the conversion routines below; main() fills
# them in from the command line.
my $faillogfile="";   # -errlog: file that converter error output is appended to
my $timeout=0;        # -timeout: CPU seconds handed to "ulimit -t" (0 = no limit)

# Program entry point.
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory of the input
# file and dispatches to the matching convertXXX routine. The string returned
# by that routine is printed on STDOUT followed by a newline.
#
# Takes the argument list as its parameters. Calls print_usage() (which
# exits) on a bad command line, and exits with status 1 when the input file
# is unreadable or its type cannot be handled.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The arguments have not been parsed yet, so the option still carries its
    # leading dash(es); allow for them, otherwise the enhanced type list is
    # never selected.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No usable -type given: fall back on the extension (minus its dot).
    # Without an extension the type stays undefined and is reported below.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname: $!";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    # anchored at both ends, like the docx test above, so that an unrelated
    # extension merely ending in "ppt" or "xls" is not dispatched here
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir: $!";

}

&main(@ARGV);
207
208
209
210# Document-type conversion functions
211#
212# The following functions attempt to convert documents from their
213# input type to the specified output type. If no output type was
214# given, then they first attempt HTML, and then TEXT.
215#
216# Each returns the output type ("html" or "text") or "fail" if no
217# conversion is possible.
218
219# Convert a Microsoft word document
220
# Convert a file carrying a Word extension. The extension cannot be trusted
# (many .doc files are not Word documents), so the content is sniffed first
# and the file is handed to the converter for what it really is.
# Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    return &convertRTF($input_filename, $output_filestem, $output_type)
        if $detected eq "rtf";

    my %is_word = map { $_ => 1 } qw(word6 word7 word8 docx);
    return &convertWord678($input_filename, $output_filestem, $output_type)
        if $is_word{$detected};

    # Not recognisably Word or RTF: use the generic fallback
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
236
237# Convert a Microsoft word 6/7/8 document
238
# Convert a Word 6/7/8 (or docx) document.
# When html is acceptable (no output type given, or one mentioning html) the
# Windows scripting converter or wv is tried, depending on
# -windows_scripting. If that is not wanted or does not succeed, the generic
# conversion takes over. Returns "html" or the result of convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
256
257
258# Convert a Rich Text Format (RTF) file
259
# Convert a Rich Text Format file.
# Only a specialised conversion to html is attempted (through Windows
# scripting when -windows_scripting is set, otherwise through rtftohtml).
# Returns "html" on success and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # Deliberately no fallback to convertAnything(): raw RTF is too noisy
    # for a strings-style text extraction to give anything worthwhile.
    return "fail";
}
284
285
286# Convert an unidentified file
287
# Convert a file of unidentified type with the generic converters.
# html is tried first and text second, each only when the requested output
# type allows it (an empty/undefined type allows both).
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Conversion to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
310
311
312
313# Convert an Adobe PDF document
314
# Convert an Adobe PDF document.
# Anything up to and including the last hyphen of the output type is
# stripped first. An image output type is then served by pdfps_to_img()
# (result "item"); html and text conversions follow, each only when the
# output type permits. Returns "item", "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.*\-(.*)/$1/i;

    # Conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if &pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
347
348
349# Convert an Adobe PostScript document
350
# Convert an Adobe PostScript document.
# Anything up to and including the last hyphen of the output type is
# stripped first. An image output type is served by pdfps_to_img() (result
# "item"); otherwise only a text conversion is available.
# Returns "item", "text" or "fail".
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.*\-(.*)/$1/i;

    # Conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
373
374
# Convert a Microsoft PowerPoint presentation.
#
# With -windows_scripting and an output type that mentions neither html nor
# text, the external pptextract program is run (with -g, -j or -p for gif,
# jpg or png output) and "item" is returned; an already existing output
# directory also counts as "item". Otherwise, when html is acceptable,
# ppttohtml.pl is run and "html" returned on success. Whenever no result
# has been returned by then, plain text extraction is the last resort.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Flag telling pptextract which image format to produce
        my $image_flag = "";
        if    ($output_type =~ m/gif/i)  { $image_flag = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $image_flag = "-j"; }
        elsif ($output_type =~ m/png/i)  { $image_flag = "-p"; }

        # On windows the bare program name is used instead of the full path
        my $extractor = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                            $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $extractor = "pptextract";
        }

        my $command = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $command .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($command) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Conversion to HTML: formulate the command
        my $perl_exe = &util::filename_cat($ENV{'PERLPATH'},"perl");
        my $command = "\"$perl_exe\" -S ppttohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($command) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: text extraction
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
438
439
# Convert a Microsoft Excel spreadsheet.
# When html is acceptable, xlstohtml.pl is run and "html" returned on
# success; otherwise, or when that run fails, text extraction is tried.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exe = &util::filename_cat($ENV{'PERLPATH'},"perl");
        my $command = "\"$perl_exe\" -S xlstohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($command) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: text extraction
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
473
474
475
476# Find the real type of a .doc file
477#
478# We seem to have a lot of files with a .doc extension that are .rtf
479# files or Word 5 files. This function attempts to tell the difference.
# Find the real type of a file that carries a Word extension.
#
# Returns one of:
#   "docx"    - -windows_scripting is set and the name ends in .docx
#               (any letter case); the content is not inspected
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - some line contains "Word.Document.6" / ".7" / ".8"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # Case-insensitive, in line with the lower-cased extension used for
    # dispatching and with the docx test in native_doc_to_html()
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "gsConvert.pl: Unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $type;
}
516
517
518# Specific type-to-type conversions
519#
520# Each of the following functions attempts to convert a document from
521# a specific format to another. If they succeed they return 1 and leave
522# the output document(s) in the appropriate place; if they fail they
523# return 0 and delete any working files.
524
525
526# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wv program (wvWare).
#
# Arguments:
#   $input_filename  - the Word document to convert
#   $output_filestem - output path without extension; the html goes to
#                      "$output_filestem.html" and, where STDERR can be
#                      redirected, wvWare's messages to "$output_filestem.err"
# Returns 1 when a non-empty html file whose first line contains
# "DOCTYPE HTML" was produced, 0 otherwise.
#
# Images: wvWare is run with --dir/--basename so that any images of the
# document end up in a folder "<docname>_files" next to the html (the same
# name Windows uses when it saves Word as html). That keeps them from
# clashing with other files when replace_srcdoc_with_html.pl later moves the
# html and its images into the import folder. --basename always takes a full
# path, so the links wvWare writes are absolute; they are made relative
# afterwards, because the Greenstone browser treats absolute paths as
# external links. An image folder that stays empty is removed again.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # A self-contained wv installation on linux brings its own bin and lib
    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
        $ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}";

        # Only append the previous value when there is one: an unset
        # LD_LIBRARY_PATH would otherwise give a warning and leave a
        # trailing ':' (an empty entry) in the search path.
        my $wv_lib = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib";
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_lib:$ENV{'LD_LIBRARY_PATH'}";
        } else {
            $ENV{'LD_LIBRARY_PATH'} = $wv_lib;
        }

        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # image folder: <toppath>/<docname>_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix (full path) that wvWare puts in front of every image file name
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            # copy the converter's messages to STDERR and, when an error
            # log was requested and can be opened, to that log as well
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") { # file has non-zero size
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            if (&util::is_dir_empty($assoc_dir)) {
                # the word document had no images
                &util::rm_r($assoc_dir);
            } else {
                # the html links to the images with absolute paths; make
                # them relative so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred: remove the working files
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    # the image folder was (re)created by this run, so it is a working file too
    &util::rm_r($assoc_dir) if -d $assoc_dir;

    return 0;
}
681
682# Method to work with doc_to_html - Word docs might contain images.
683# When such word docs are converted with wvWare, we make it generate a
684# <filename>_files folder with the associated images, while the html file
685# <filename> refers to the images using absolute paths to <filename>_files.
686# This method reads in that html file and replaces all the absolute paths to
687# the images in <filename>_files with the relative paths to the images from
688# that folder. (I.e. with <filename>_files/<imagename.ext>).
# Rewrite the absolute image/link paths that wvWare wrote into an html file
# so that they are relative to the html file.
#
# Arguments:
#   $toppath        - folder in which the html file resides (not used here)
#   $docname        - name of the html file without extension (not used here)
#   $html_file      - full path to the html file: /full/path/docname.html
#   $assoc_dir_path - full path of the image folder: toppath/docname_files
#   $assoc_dirname  - bare name of that folder: docname_files
# Returns 1 on success, 0 when the file cannot be read or written (a message
# is printed on STDERR in that case).
#
# Every href= or src= value found after the first <a or <img that starts
# with $assoc_dir_path gets that prefix replaced by $assoc_dirname, and the
# resulting url is passed through post_process_assocfile_urls(). Only the
# url itself is post-processed; the rest of the document is left alone.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the whole file is read as one single 'line'
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Build a pattern for the absolute folder path as it appears in the
    # html. wvWare puts %20 in place of spaces, so do the same to the prefix
    # first; \Q..\E then escapes every regex metacharacter (backslashes of
    # Windows paths, '.', '-', brackets, parentheses, '+', ...).
    (my $encoded_dir_path = $assoc_dir_path) =~ s/ /%20/g;
    my $abs_dir_re = qr/\Q$encoded_dir_path\E/;

    # 3. Substitute all occurrences. $1 is everything from the <a or <img up
    # to and including href=/src= and the optional opening quote; $2 is the
    # remainder of the url (it ends at the closing quote, at white space or
    # at the end of the tag). s treats the whole document as one line, g
    # replaces every occurrence and e evaluates the replacement as code.
    $html_contents =~ s{(<(?:a|img).*?(?:href|src)=(?:"|')?)$abs_dir_re([^"'>\s]*)}
                       {&post_process_assocfile_urls($1, $assoc_dirname . $2, "")}sge;

    # 4. Write the updated contents back. Opening for writing truncates the
    # file, so it is not deleted beforehand: should the open fail, the
    # original contents are still there.
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print $fout $html_contents;
    unless (close($fout)) {
        print STDERR "gsConvert.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
767
768# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
769# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
770# introduced in link pathnames by wvWare into space again. Converts all percent signs
771# introduced by URL encoding filenames generated into %25 in these url links referencing them
# Tidy up one link path and return it glued between the text that came
# before and after it.
#   $pre  - text preceding the path, passed through untouched
#   $text - the path itself
#   $post - text following the path, passed through untouched
# The path is changed in three steps, in this order: every %20 becomes a
# space (not an underscore, since underscores interfere with incremental
# rebuilding), every backslash becomes a forward slash, and every percent
# sign still present becomes %25.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        s/\\/\//g;
        s/%/%25/g;
    }

    return $pre . $text . $post;
}
783
784# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with a scripting program:
# docx2html.vbs (run through CScript) for .docx input on windows, the
# word2html program otherwise.
#
# Arguments:
#   $input_filename  - the document to convert
#   $output_filestem - output path without extension; the html goes to
#                      "$output_filestem.html" and, where STDERR can be
#                      redirected, error output to "$output_filestem.err"
# Returns 1 when "$output_filestem.html" already existed (nothing is run
# then) or was produced with a first line containing "html"; 0 otherwise.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # docx can't be processed by word2html, so a VBscript gets the local
        # Word install (if any) to do the conversion
        if ($input_filename =~ m/\.docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR; the //Nologo
            # flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions: use the usual VB executable word2html.
            # Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # copy the converter's messages to STDERR and, when an error
            # log was requested and can be opened, to that log as well
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred: remove the working files
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
884
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments:
#   $input_filename  - the RTF file handed to rtftohtml
#   $output_filestem - stem for the generated files:
#                        "$output_filestem.html"      the converted document
#                        "$output_filestem.err"       captured stderr
#                        "${output_filestem}_ToC.html" optional table of
#                                                     contents, merged into
#                                                     the document if present
#
# Returns 1 when the generated HTML holds some text after its <body> line,
# otherwise 0 (the .html and .err files are removed in that case and the
# error output is appended to $faillogfile when that is set).
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    # stderr is only captured off Windows, or when $is_winnt_2000 is set
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);


    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    # An unreadable output file is treated the same as an empty one.
    my $was_successful=0;
    if (-s "$output_filestem.html"
        && open(HTML, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header=0;
        while (my $line=<HTML>) {

            if ($past_header == 0) {
                if ($line =~ m/<body>/) {$past_header=1;}
                next;
            }

            # strip tags; anything left with a word character is content
            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful=1;
                last;
            }
        }
        close HTML;
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # NB: "or", not "||" - with "||" the test binds to the filename
            # string (always true) and a failed open would go unnoticed.
            my $open_failed=0;
            open(HTMLSRC, '<', "$output_filestem.src") or ++$open_failed;
            open(TOC, '<', "${output_filestem}_ToC.html") or ++$open_failed;
            open(HTML, '>', "$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the merge and put the plain document back
                close HTMLSRC;
                close TOC;
                close HTML;
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: copy lines up to the
            # first line without a word character (that line is dropped)
            while (defined(my $line = <HTMLSRC>)) {
                last if ($line !~ m/\w/);
                print HTML $line;
            }

            # print out table of contents, making links relative
            my $ignored = <TOC>; # ignore first 2 lines
            $ignored = <TOC>;
            my $toc_start = <TOC>; # line 3 = "<ol>\n"
            print HTML $toc_start if (defined $toc_start);
            while (my $line=<TOC>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print HTML $line;
            }
            close TOC;

            # rest of html src
            while (my $line = <HTMLSRC>) {
                print HTML $line;
            }
            close HTMLSRC;
            close HTML;

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Conversion failed: record the converter's stderr in the fail log
    # (when one is configured) and clean up.
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
        {
            print FAILLOG "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print FAILLOG " (rtf file might be too recent):\n";
            if (open(ERRLOG, '<', "$output_filestem.err")) {
                while (<ERRLOG>) {print FAILLOG $_;}
                close ERRLOG;
            }
            close FAILLOG;
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
1001
1002
# Convert a pdf file to html by running the pdftohtml.pl script.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - the PDF file to convert
#   $output_filestem - stem for the generated files (.html, .out, .err)
#
# The -c / -i / -a / -hidden switches and the zoom factor come from the
# file-level $pdf_* settings.
#
# Returns 1 if the script exited with status 0 and left a non-empty
# "$output_filestem.html"; otherwise reports the error log, appends it to
# $faillogfile (when set), removes the generated files and returns 0.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Collect the pieces of the command line; they are joined with single
    # spaces once the list is complete.
    my $perl_exe = &util::filename_cat($ENV{'PERLPATH'},"perl");
    my @pieces = ("\"$perl_exe\" -S pdftohtml.pl -zoom $pdf_zoom");
    push(@pieces, "-c") if ($pdf_complex);
    push(@pieces, "-i") if ($pdf_ignore_images);
    push(@pieces, "-a") if ($pdf_allow_images_only);
    push(@pieces, "-hidden") unless ($pdf_nohidden);
    push(@pieces, "\"$input_filename\" \"$output_filestem\"");

    # stdout and stderr go to separate files, except on Windows without
    # $is_winnt_2000 where only stdout is redirected (into the .err file)
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        push(@pieces, "> \"$out_file\" 2> \"$err_file\"");
    }
    else {
        push(@pieces, "> \"$err_file\"");
    }

    my $cmd = ($timeout ? "ulimit -t $timeout;" : "") . join(" ", @pieces);

    $!=0;
    my $status = system($cmd);
    my $run_failed = ($status != 0);
    if ($run_failed) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Success: the converter ran cleanly and made something
    if (!$run_failed && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }
    print STDERR "***********output filestem $html_file\n";
    &util::rm($html_file) if (-e $html_file);

    # copy the error log into the fail log before discarding it
    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
1066
# Convert a pdf (or postscript) file to various types of image by running
# the pdfpstoimg.pl script, which relies on ImageMagick.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - the PDF/PS file to convert
#   $output_filestem - stem passed to pdfpstoimg.pl; also used for the
#                      .out/.err files that capture its output
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed on as -convert_to
#
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0 (after
# reporting the error log and appending it to $faillogfile when set).

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # the output is captured only to keep it off the terminal;
        # just the exit status in $? is of interest
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    # pdfpstoimg.pl itself is located through perl's -S switch, so the
    # PATH has to be set up correctly
    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval !=0)
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        #print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(ERRLOG, '<', "$output_filestem.err")
                or die "Couldn't open $output_filestem.err: $!";
            print STDERR "pdfpstoimg error log:\n";
            while (<ERRLOG>) {
                print STDERR "$_";
            }
            close ERRLOG;
        }
        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
            {
                if (open(ERRLOG, '<', "$output_filestem.err")) {
                    while (<ERRLOG>) {print FAILLOG $_;}
                    close ERRLOG;
                }
                close FAILLOG;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1136
# Convert a PDF file to text with the pdftotext command
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - the PDF file to convert
#   $output_filestem - stem for the generated files; the text ends up in
#                      "$output_filestem.text"
#
# Returns 1 if pdftotext left a text file containing at least one word
# character, otherwise 0 (after reporting the error log and appending it to
# $faillogfile when set). The .out/.err capture files are removed either way.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text=0;
        if (open(EXTR_TEXT, '<', "$output_filestem.text")) {
            binmode(EXTR_TEXT); # just in case...
            while (my $line=<EXTR_TEXT>) {
                if ($line=~ m/\w/) {$seen_text=1; last;}
            }
            close EXTR_TEXT;
        } else {
            # an unreadable file is handled like one without any text
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(ERRLOG, '<', "$output_filestem.err")
                or die "Couldn't open $output_filestem.err: $!";
            print STDERR "pdftotext error log:\n";
            while (<ERRLOG>) {
                print STDERR "$_";
            }
            close ERRLOG;
        }
        # the stdout redirect above creates a .out file on non-Windows
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
            {
                if (open(ERRLOG, '<', "$output_filestem.err")) {
                    while (<ERRLOG>) {print FAILLOG $_;}
                    close ERRLOG;
                }
                close FAILLOG;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    # success: drop both capture files, not just the .err one
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1202
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - stem for the generated files; the text ends up in
#                      "$output_filestem.text"
#
# gs is tried first (never on Windows). If that reports an error, the error
# is logged to $faillogfile (when set) and a regexp-based extraction of the
# parenthesised strings in the file is used instead. Either way the result
# is finally re-wrapped to lines of roughly 72 characters.
#
# Returns 1 once the wrapped text file has been written. Returns 0 if the
# fallback could not read the input, could not write the output, or the
# input does not begin with "%!". Dies if the files used for the wrapping
# step cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths so a filestem with spaces still works
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        system($cmd);
        # see man perlfunc - system. $? is non-zero when the program could
        # not be started (-1), exited with a non-zero status (high byte) or
        # was killed by a signal (low byte); each of those is a failure.
        # If it couldn't start, look at $! for the message.
        if ($? != 0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
        elsif (! -e "$output_filestem.text") {
            $error="did not create output file.\n";
        }
        else
        { # make sure the interpreter didn't get an error. It is technically
          # possible for the actual text to start with this, but....
            if (open(PSOUT, '<', "$output_filestem.text")) {
                my $first_line = <PSOUT>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error="interpreter error - \"$1\"";
                }
                close PSOUT;
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
        {
            print FAILLOG "gs - $error\n";
            if (-e "$output_filestem.err" 
                && open(ERRLOG, '<', "$output_filestem.err")) {
                while (<ERRLOG>) {print FAILLOG $_;}
                close ERRLOG;
            }
            close FAILLOG;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        if (!open(IN, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }

        my $text=""; # this is for whole .ps file...
        $text = join('', <IN>); # see man perlport, under "System Resources"
        close IN;

        # Make sure this is a ps file...
        # (checked before the output file is created, so a rejected input
        # leaves no empty .text file or open handle behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
            {
                print FAILLOG "Bad postscript header: not '%!'\n";
                close FAILLOG;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        if (!open(OUT, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print OUT "$text";
        close OUT;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # NB: "or", not "||" - "||" would bind to the filename string and the
    # die could never fire.
    open(INFILE, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(OUTFILE, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line=<INFILE>) {
        while (length($line)>0) {
            # Peel off at least $wrap_length characters plus the rest of the
            # current word. Should the pattern ever fail to match, the
            # remainder is written as-is rather than looping forever.
            if (length($line)>$wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print OUTFILE "$1\n";
            } else {
                print OUTFILE "$line";
                $line="";
            }
        }
    }
    close INFILE;
    close OUTFILE;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1372
1373
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - the file to extract strings from
#   $output_filestem - stem for the generated files; the result is written
#                      to "$output_filestem.html"
#
# The text is first produced by any_to_text and then wrapped in a minimal
# HTML page: blank lines become <p>, all other lines are prefixed with <br>,
# and the characters & < > are replaced by their entities. The intermediate
# .text file is removed.
#
# Returns 1 on success, 0 if any_to_text failed or either file could not be
# opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    if (!open(TEXT, '<', "$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }
    if (!open(HTML, '>', "$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close TEXT;
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print HTML "<html><head>\n";
    print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print HTML "</head><body>\n\n";

    while (my $line=<TEXT>) {
        # "&" has to be escaped first, otherwise the entities introduced
        # for "<" and ">" would be escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print HTML "<p>";
        } else {
            print HTML "<br> ", $line;
        }
    }
    print HTML "\n</body></html>\n";

    close HTML;
    close TEXT;

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1410
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - the file to extract strings from (read in binmode)
#   $output_filestem - stem for the result, "$output_filestem.text"
#
# Runs of characters outside \040-\176 are turned into line breaks and
# lines shorter than 10 characters are dropped.
#
# Returns 0 straight away when $use_strings is false, 0 if either file
# cannot be opened, 0 (removing the .text file) when nothing was written,
# and 1 otherwise.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless ($use_strings);

    print STDERR "\n**** In any to text****\n\n";

    unless (open(IN, "<$input_filename")) {
        return 0;
    }
    binmode(IN);
    unless (open(OUT, ">$output_filestem.text")) {
        return 0;
    }

    my $chunks_written = 0;
    while (<IN>) {
        my $chunk = $_;

        # delete anything that isn't a printable character
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # output whatever is left; chunks holding only newlines and
        # spaces are skipped
        next unless ($chunk =~ m/[^\n ]/);
        print OUT $chunk;
        $chunks_written++;
    }

    close OUT;
    close IN;

    # try to protect against binary only formats
    return 1 if ($chunks_written);

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.