source: gsdl/trunk/bin/script/gsConvert.pl@ 19763

Last change on this file since 19763 was 19763, checked in by ak19, 15 years ago

No longer convert spaces to underscores in the rename_file subroutine, since underscores break incremental building: renaming a file forces the incremental build to rebuild everything, because it concludes that the file with the original name has been deleted and that new files have been added.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 44.0 KB
RevLine 
[1445]1#!/usr/bin/perl -w
2
3###########################################################################
4#
[2032]5# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
[3013]11# Copyright (C) 1999-2002 New Zealand Digital Library Project
[1445]12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]33#
[3013]34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
[2032]38#
[3013]39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
[2032]42# We try to convert Postscript files to text using "gs" which is often on
[2755]43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
[1445]45
# Runs at compile time, before the `use` lines that follow are processed:
# refuse to start without GSDLHOME and put its perllib directory at the
# front of the module search path.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    unshift @INC, $gsdl_home . "/perllib";
}
50
51use parsargv;
52use util;
53use Cwd;
54use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If Win32 cannot be loaded the eval yields undef, which is normalised to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
[3350]67
# Write the command-line help text to STDERR and terminate the script
# with exit status 1. Never returns.
sub print_usage
{
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_text) {
        print STDERR $usage_line;
    }
    exit(1);
}
89
# Script-wide settings, filled in from the command line by main() and read
# by the conversion subs below.
my $faillogfile = "";   # -errlog: file that converter error output is appended to
my $timeout = 0;        # -timeout: value given to "ulimit -t" (0 = no limit)

# Entry point.
#
# Parses the options, checks that exactly one readable input file was
# named, works out the input type (from -type, else from the file
# extension), changes into the file's directory and calls the matching
# convertXXX sub. The value that sub returns ("html", "text", "item" or
# "fail") is printed on STDOUT followed by a newline.
#
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable file, a missing type or an unsupported type.
sub main
{
    # Work on a private copy; parsargv::parse is handed a reference to it.
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
            'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
            '/errlog/.*/', \$faillogfile,
            'output/(auto|html|text|pagedimage).*/', \$output_type,
            'timeout/\d+/0', \$timeout,
            'verbose/\d+/0', \$verbose,
            'use_strings', \$use_strings,
            'windows_scripting', \$windows_scripting,
            'pdf_complex', \$pdf_complex,
            'pdf_ignore_images', \$pdf_ignore_images,
            'pdf_allow_images_only', \$pdf_allow_images_only,
            'pdf_nohidden', \$pdf_nohidden,
            'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames: the suffix is the final ".xxx" of the name, if any
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the extension, minus its leading dot.
    # A file without an extension leaves the type empty so that it is
    # reported below as missing, not as an unsupported type ''.
    my $type_missing = (!defined $input_type || $input_type eq "");
    if ($type_missing && length($suffix) > 1) {
        $input_type = lc(substr($suffix, 1));
        $type_missing = 0;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    if ($type_missing) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Select convert utility. convertPDF is kept apart because it takes
    # the directory as an extra leading argument.
    my %converter_for = (
        'doc' => \&convertDOC,
        'dot' => \&convertDOC,
        'rtf' => \&convertRTF,
        'ps'  => \&convertPS,
        'ppt' => \&convertPPT,
        'xls' => \&convertXLS,
    );

    if ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif (defined $converter_for{$input_type}) {
        print $converter_for{$input_type}->($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
182
183
184
[2241]185# Document-type conversion functions
[1445]186#
187# The following functions attempt to convert documents from their
188# input type to the specified output type. If no output type was
189# given, then they first attempt HTML, and then TEXT.
190#
191# Each returns the output type ("html" or "text") or "fail" if no
192# conversion is possible.
193
# Convert a Microsoft Word document.
#
# Arguments: input filename, output filestem (output path without
# extension), requested output type.
# Returns whatever the chosen converter returns: "html", "text" or "fail".
sub convertDOC {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else is handled by the generic converter
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
210
# Convert a Microsoft Word 6/7/8 document.
#
# When no output type was requested, or the requested type contains
# "html", an HTML conversion is attempted first: native_doc_to_html if
# -windows_scripting was given, doc_to_html otherwise. If that is skipped
# or fails, the file is handed to convertAnything.
# Returns "html", or whatever convertAnything returns.
sub convertWord678 {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $success;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
230
231
# Convert a Rich Text Format (RTF) file.
#
# Only an HTML conversion is attempted, and only when no output type was
# requested or the requested type contains "html": native_doc_to_html if
# -windows_scripting was given, rtf_to_html otherwise.
# Returns "html" on success, "fail" otherwise.
sub convertRTF {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $success;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
258
259
# Convert an unidentified file.
#
# Tries any_to_html when no output type was requested or the type contains
# "html", then any_to_text when no type was requested or the type contains
# "text". Returns "html" or "text" for the first attempt that succeeds,
# "fail" if none does.
sub convertAnything {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
284
285
[1654]286
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output filestem,
# requested output type.
# Attempts, in order and only where the requested type allows it: images
# (pdfps_to_img), HTML (pdf_to_html), text (pdf_to_text).
# Returns "item", "html" or "text" for the first attempt that succeeds,
# otherwise "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # If the requested type contains a "-", keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    my $no_type_given = !$output_type;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type))
    {
        return "item";
    }

    # Attempt conversion to HTML
    if (($no_type_given || $output_type =~ m/html/i)
        && &pdf_to_html($dirname, $input_filename, $output_filestem))
    {
        return "html";
    }

    # Attempt conversion to TEXT
    if (($no_type_given || $output_type =~ m/text/i)
        && &pdf_to_text($dirname, $input_filename, $output_filestem))
    {
        return "text";
    }

    return "fail";
}
321
322
# Convert an Adobe PostScript document.
#
# Arguments: input filename, output filestem, requested output type.
# Attempts images (pdfps_to_img) when the requested type names jpg/gif/png,
# then text (ps_to_text) when no type was requested or the type contains
# "text". Returns "item" or "text" on success, otherwise "fail".
sub convertPS {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # If the requested type contains a "-", keep only what follows the last one
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        # main()'s $dirname is lexical and not visible here, so derive the
        # directory from the input filename with the same fileparse call
        # main() uses, rather than passing an undefined value on.
        my (undef, $dirname)
            = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";
}
347
348
# Convert a Microsoft PowerPoint document.
#
# Arguments: input filename, output filestem, requested output type.
#  - With -windows_scripting and a requested type that contains neither
#    "html" nor "text", the pptextract script is run (with -g, -j or -p
#    for gif, jpg or png output) and "item" is returned on success, or
#    straight away if the output directory already exists.
#  - Otherwise, when no type was requested or the type contains "html",
#    ppttohtml.pl is run and "html" is returned on success.
#  - If neither applies or the command fails, any_to_text is tried.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;
    my $success = 0;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        # on windows use the bare name and rely on the PATH
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # $cmd is lexical here so the command line cannot leak into, or
        # be picked up from, another sub through a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}

        # if the conversion directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: text extraction
    $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
410
411
# Convert a Microsoft Excel document.
#
# Arguments: input filename, output filestem, requested output type.
# When no type was requested or the type contains "html", xlstohtml.pl is
# run and "html" is returned on success. Otherwise, or if that command
# fails, any_to_text is tried.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command; $cmd is lexical so it cannot leak into,
        # or be picked up from, another sub through a package global
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: text extraction
    $success = &any_to_text($input_filename, $output_filestem);
    if ($success) {
        return "text";
    }

    return "fail";
}
444
445
446
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the filename to inspect.
# Returns "rtf" if the first line starts with "{\rtf"; "word6", "word7"
# or "word8" if any line contains "Word.Document.6/7/8"; "unknown"
# otherwise, including when the file cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open on a lexical handle: the filename is never
    # interpreted as a mode, and the handle is private to this call.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
482
483
[1734]484# Specific type-to-type conversions
[1445]485#
486# Each of the following functions attempts to convert a document from
[2755]487# a specific format to another. If they succeed they return 1 and leave
[1445]488# the output document(s) in the appropriate place; if they fail they
489# return 0 and delete any working files.
490
491
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output filestem.
# Runs wvWare with its output redirected to "<filestem>.html" and, except
# on Windows 95/98, its STDERR to "<filestem>.err".
# Returns 1 if the html file is non-empty and its first line contains
# "DOCTYPE HTML"; returns 0 otherwise, copying the contents of the .err
# file to the -errlog file when one was given.
sub doc_to_html {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into a
    # folder docname-without-extension_files, at the same level as the
    # html file generated from the doc. replace_srcdoc_with_html.pl moves
    # the html and associated files into the import folder; keeping the
    # associated files in their own folder ensures they won't overwrite
    # similarly named items already in import.
    #
    # To do this we use wvWare's --dir and --basename options, where dir
    # is the full path to the image folder and basename is that path with
    # the name to be prepended to every image file appended to it: eg. if
    # the images were to have names like sample0.jpg to sampleN.jpg, then
    # the basename is "/full/path/to/imgdir/sample".
    # HOWEVER: basename always takes a full path, not a relative url, so
    # the greenstone browser is unable to display the images (absolute
    # paths cause it to give an "external link" message). The links are
    # therefore made relative again after the conversion, below.
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix (folder path plus document name) given to wvWare for image names
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            # fail log is optional: only used when -errlog was given and
            # the file can be opened for appending
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or $faillog = undef;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if defined $faillog;
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if defined $faillog;
                }
                close($errfile);
            }
            close($faillog) if defined $faillog;
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") { # file has non-zero size (i.e. it has contents)
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the
            # html was generated (there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else {
                # An image folder was generated, so the html file contains
                # absolute links to the images. Replace them with relative
                # links so the folder can be moved elsewhere.
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
642
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - top-level folder in which the html file resides
#   docname        - name (without extension) of the html file
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - name of the folder with associated imgs: docname_files
# (toppath and docname are accepted but not used in the body.)
# Returns 1 on success, 0 if the html file cannot be opened for reading
# or for writing.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;           # Read entire file at once
        $html_contents = <$fin>;    # Now file is read in as one single 'line'
    }
    close($fin);

    # 2. Replace *all* occurrences of assoc_dir_path in href and src values
    # with assoc_dirname.
    # wvWare puts %20 in place of spaces, so the prefix we look for must do
    # the same. The prefix is then passed through quotemeta so that every
    # character with a special meaning in a regular expression (".", "-",
    # brackets, parentheses, "+", Windows backslashes, ...) is matched
    # literally.
    my $assoc_url_prefix = $assoc_dir_path;
    $assoc_url_prefix =~ s/ /%20/g;
    my $assoc_prefix_pattern = quotemeta($assoc_url_prefix);

    # Looks for <a or <img, followed by anything up to the FIRST href= or
    # src= whose value (optionally opened by " or ') starts with the
    # associated folder's path. Everything before the path and everything
    # after it up to the next > is retained; only the path itself is
    # replaced by the folder name. /s lets . match newlines, /g replaces
    # every occurrence.
    $html_contents =~ s/(<(?:a|img).*?(?:href|src)=(?:\"|\')?)$assoc_prefix_pattern(.*?(?:\"|\')?.*?>)/$1$assoc_dirname$2/sg;

    # 3. Post-process the href/src value of each a/img tag (turn %20 back
    # into spaces etc. - see post_process_assocfile_urls). The value is
    # captured with negated character classes so that ONLY the attribute
    # value is rewritten, never the text that follows the tag.
    $html_contents =~ s{(<(?:a|img)[^>]*?(?:href|src)=)(?:"([^"]*)"|'([^']*)'|([^\s>]*))}{
          defined $2 ? &post_process_assocfile_urls("$1\"", $2, "\"")
        : defined $3 ? &post_process_assocfile_urls("$1'",  $3, "'")
        :              &post_process_assocfile_urls($1,     $4, "")
    }ge;

    # delete the original file and recreate it
    &util::rm($html_file);

    # Recreate the original file for writing the updated contents
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file
    print {$fout} $html_contents;
    close($fout);
    return 1;
}
728
# Utility routine for tidying a link pathname found in an img src / a href.
# Takes the text before the pathname, the pathname, and the text after it.
# Within the pathname only, in this order:
#   - every %20 becomes a space (not an underscore, since underscores mess
#     with incremental rebuild),
#   - every windows-style backslash becomes a url slash (/),
#   - every remaining percent sign becomes %25.
# Returns the three pieces joined back together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        tr{\\}{/};
        s/%/%25/g;
    }

    return join('', $pre, $text, $post);
}
744
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: input filename, output filestem.
# Returns 1 straight away if "<filestem>.html" already exists. Otherwise
# runs word2html and returns 1 if the html file is non-empty and its first
# line contains "html"; returns 0 otherwise, copying the contents of the
# .err file to the -errlog file when one was given.
sub native_doc_to_html {
    # Lexical copies, so this sub does not overwrite package globals of
    # the same names.
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows use the bare name and rely on the PATH
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            # fail log is optional: only used when -errlog was given and
            # the file can be opened for appending
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or $faillog = undef;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if defined $faillog;
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if defined $faillog;
                }
                close($errfile);
            }
            close($faillog) if defined $faillog;
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
821
[1654]822# Attempt to convert an RTF document to html with rtftohtml
823
sub rtf_to_html {
    # Convert an RTF document to HTML by running the external "rtftohtml"
    # program.
    #
    # Arguments:
    #   $input_filename  - path of the RTF file to convert
    #   $output_filestem - output path without extension; the converter is
    #                      told to write "$output_filestem.html"
    #
    # Returns 1 if an HTML file containing some text after its <body> line
    # was produced, 0 otherwise (the .html and .err files are then removed
    # and the converter's error output is appended to $faillogfile, if set).
    #
    # Reads the settings $timeout, $is_winnt_2000 and $faillogfile and
    # assigns to $cmd; none of these are declared inside this sub.
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    # only redirect stderr where the shell is expected to support "2>"
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header=0;
            while (my $line=<$html_in>) {

                if ($past_header == 0) {
                    if ($line =~ m/<body>/) {$past_header=1;}
                    next;
                }

                # strip tags; any word character left over counts as content
                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close $html_in;
        }
        # if the file could not be opened we treat it as "no content"
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # NB: "or", not "||" - "||" would bind to the filename string
            # and a failed open would go unnoticed.
            my ($htmlsrc, $toc, $html);
            my $open_failed=0;
            open($htmlsrc, '<', "$output_filestem.src") or ++$open_failed;
            open($toc, '<', "${output_filestem}_ToC.html") or ++$open_failed;
            open($html, '>', "$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the ToC and put the converter's output back
                foreach my $fh ($htmlsrc, $toc, $html) {
                    close($fh) if (defined($fh) && defined(fileno($fh)));
                }
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html. The first line without
            # a word character ends the header and is itself dropped.
            my $header_line;
            while (defined($header_line = <$htmlsrc>)
                   && $header_line =~ m/\w/) {
                print $html $header_line;
            }

            # print out table of contents, making links relative
            my $toc_line = <$toc>;  # ignore first 2 lines
            $toc_line = <$toc>;
            $toc_line = <$toc>;     # line 3 = "<ol>\n"
            print $html $toc_line if (defined($toc_line));
            while ($toc_line=<$toc>) {
                $toc_line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@;
                print $html $toc_line;
            }
            close $toc;

            # rest of html src
            while (my $src_line = <$htmlsrc>) {
                print $html $src_line;
            }
            close $htmlsrc;
            close $html;

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Conversion failed: record the converter's stderr in the fail log.
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            #print $faillog "Error - rtf-converter - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
939
940
[1445]941# Convert a pdf file to html with the pdftohtml command
942
sub pdf_to_html {
    # Convert a PDF file to HTML by running "perl -S pdftohtml.pl".
    #
    # Arguments:
    #   $dirname         - accepted but not used by this sub
    #   $input_filename  - path of the PDF file
    #   $output_filestem - output path without extension
    #
    # Returns 1 when the command exits with status 0 and a non-empty
    # "$output_filestem.html" exists, 0 otherwise. The temporary .out and
    # .err files are removed in both cases; on failure the error output is
    # echoed to STDERR and appended to $faillogfile (if set).
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Collect the converter switches selected by the pdf_* settings.
    my @switches = ("-zoom $pdf_zoom");
    push(@switches, "-c") if ($pdf_complex);
    push(@switches, "-i") if ($pdf_ignore_images);
    push(@switches, "-a") if ($pdf_allow_images_only);
    push(@switches, "-hidden") unless ($pdf_nohidden);

    $cmd = ($timeout) ? "ulimit -t $timeout;" : "";
    $cmd .= "perl -S pdftohtml.pl " . join(" ", @switches);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Send stdout and stderr to separate files where "2>" is usable,
    # otherwise capture stdout alone in the .err file.
    my $separate_stderr
        = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    if ($separate_stderr) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $!=0;

    my $exit_status = system($cmd);
    if ($exit_status != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Success: the converter exited cleanly and made something.
    if ($exit_status == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on.
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<ERRLOG>) {
            print STDERR $_;
        }
        close ERRLOG;
    }

    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            while (<ERRLOG>) {
                print FAILLOG $_;
            }
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }

    return 0;
}
1002
1003# Convert a pdf file to various types of image with the convert command
1004
sub pdfps_to_img {
    # Convert a PDF or PostScript file to images by running
    # "perl -S pdfpstoimg.pl".
    #
    # Arguments:
    #   $dirname         - accepted but not used by this sub
    #   $input_filename  - path of the PDF/PS file
    #   $output_filestem - output path without extension
    #   $output_type     - requested output type; everything up to and
    #                      including the last "_" is stripped and the
    #                      remainder is passed as -convert_to
    #
    # Returns 1 if the command exits with status 0, otherwise 0. Also
    # returns 0 without running anything when ImageMagick's "identify"
    # cannot be started. The temporary .out and .err files are removed in
    # all cases; on failure the error output is echoed to STDERR and
    # appended to $faillogfile (if set).
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # only the exit status matters; the output is discarded
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        # name the script that was actually run
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval !=0)
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        #print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }
        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $err_line = <$errlog>) {print $faillog $err_line;}
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1071
1072# Convert a PDF file to text with the pdftotext command
1073
sub pdf_to_text {
    # Convert a PDF file to plain text by running the external
    # "pdftotext" program.
    #
    # Arguments:
    #   $dirname         - accepted but not used by this sub
    #   $input_filename  - path of the PDF file
    #   $output_filestem - output path without extension; the text is
    #                      written to "$output_filestem.text"
    #
    # Returns 1 if a non-empty .text file containing at least one word
    # character was produced, 0 otherwise. The temporary .out and .err
    # files are removed in both cases; on failure the error output is
    # echoed to STDERR and appended to $faillogfile (if set).
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text=0;
        if (open(my $extr_text, '<', "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (my $line=<$extr_text>) {
                if ($line =~ m/\w/) {
                    $seen_text=1;
                    last;
                }
            }
            close $extr_text;
        }
        else {
            # an unreadable output file is treated as "no text"
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }
        # the .out file exists whenever stdout was redirected to it above
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $err_line = <$errlog>) {print $faillog $err_line;}
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    # Success: tidy up both temporary files (the .out file used to be left
    # behind here, unlike in the other pdf converters).
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1137
[2012]1138# Convert a PostScript document to text
1139# note - just using "ps2ascii" isn't good enough, as it
1140# returns 0 for a postscript interpreter error. ps2ascii is just
1141# a wrapper to "gs" anyway, so we use that cmd here.
[1445]1142
sub ps_to_text {
    # Convert a PostScript document to plain text.
    #
    # First tries ghostscript ("gs" with ps2ascii.ps); this is skipped on
    # Windows. If that fails for any reason, falls back to a crude
    # regular-expression based extraction of the strings in the file.
    # The resulting text is then re-wrapped at the first whitespace after
    # 72 characters.
    #
    # Arguments:
    #   $input_filename  - path of the PostScript file
    #   $output_filestem - output path without extension; the text is
    #                      written to "$output_filestem.text"
    #
    # Returns 1 on success. Returns 0 if the fallback cannot read the
    # input, cannot write the output, or the input does not start with
    # "%!". Dies if the wrapped output cannot be (re)opened.
    # Reads $timeout and $faillogfile, which are not declared in this sub.
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted, so that a filestem containing spaces still works
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        system($cmd);
        # Look at the whole wait status, not just ($? >> 8): a gs that was
        # killed by a signal (e.g. the ulimit CPU limit) has a zero exit
        # code in the high byte but has certainly not succeeded.
        # if system returns -1 | 127 (couldn't start program), look at $! for message
        my $wait_status = $?;

        if ($wait_status!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
        elsif (! -e "$output_filestem.text") {
            $error="did not create output file.\n";
        }
        else
        {   # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error="interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print $faillog $err_line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }

        my $text=""; # this is for whole .ps file...
        $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close $ps_in;

        # Make sure this is a ps file... (checked before the output file is
        # created, so a rejected input leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $text_out;
        if (!open($text_out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print $text_out "$text";
        close $text_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # NB: "or", not "||" - with "||" these dies could never fire
    open(my $wrap_in, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line=<$wrap_in>) {
        while (length($line)>0) {
            # chop off one wrapped line; if the pattern unexpectedly does
            # not match, emit the remainder as-is rather than loop forever
            if (length($line)>$wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                print $wrap_out "$line";
                $line="";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1307
1308
1309# Convert any file to HTML with a crude perl implementation of the
1310# UNIX strings command.
1311
sub any_to_html {
    # Convert any file to HTML: extract its printable strings with
    # any_to_text, then wrap them in a minimal HTML page.
    #
    # Arguments:
    #   $input_filename  - path of the file to convert
    #   $output_filestem - output path without extension; the page is
    #                      written to "$output_filestem.html"
    #
    # Returns 1 on success, 0 if any_to_text fails or either file cannot
    # be opened. The intermediate .text file is removed on success.
    #
    # The arguments are lexical so that same-named variables elsewhere in
    # the script are not overwritten by a call.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    open(my $text_in, '<', "$output_filestem.text") or return 0;
    my $html_out;
    if (!open($html_out, '>', "$output_filestem.html")) {
        close $text_in;
        return 0;
    }

    print $html_out "<html><head>\n";
    print $html_out "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_out "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_out "</head><body>\n\n";

    while (my $line=<$text_in>) {
        # escape markup characters; "&" must go first so the entities
        # produced for "<" and ">" are not themselves re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            # blank line => paragraph break
            print $html_out "<p>";
        } else {
            print $html_out "<br> ", $line;
        }
    }
    print $html_out "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1345
1346# Convert any file to TEXT with a crude perl implementation of the
1347# UNIX strings command.
[2755]1348# Note - this assumes ascii charsets :( (jrm21)
[1445]1349
sub any_to_text {
    # Extract printable text from any file, in the manner of the UNIX
    # "strings" command: runs of characters outside \040-\176 become
    # line breaks and strings shorter than 10 characters are dropped.
    # Note - this assumes ascii charsets :( (jrm21)
    #
    # Arguments:
    #   $input_filename  - path of the file to read (read in binary mode)
    #   $output_filestem - output path without extension; the text is
    #                      written to "$output_filestem.text"
    #
    # Returns 1 if at least one chunk of text was written. Returns 0 if
    # $use_strings is false, if either file cannot be opened, or if
    # nothing was extracted (the empty .text file is then removed).
    #
    # The arguments are lexical so that same-named variables elsewhere in
    # the script are not overwritten by a call.
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n**** In any to text****\n\n";
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in;   # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.