source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 22446

Last change on this file since 22446 was 22429, checked in by davidb, 14 years ago

Support of using OpenOffice scripting through JODConverter.jar added. Also added in 'use strict' and then fixed up a variety of places that needed 'my' added

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 48.3 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
46BEGIN {
47 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56use File::Basename;
57
# Are we running on WinNT or Win2000 (or later)?
# If Win32 cannot be loaded the eval yields undef, which is normalised
# to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to
# parsargv::parse, which fills them in.
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
my $openoffice_scripting;
70
# Print the command-line synopsis to STDERR and terminate the script
# with exit status 1 (this sub never returns).
#
# The list of values shown for -type depends on $openoffice_scripting,
# which main() sets by pre-scanning the arguments before parsing them.
sub print_usage
{
    # docx and odf are only offered when OpenOffice scripting is enabled
    my $input_types = $openoffice_scripting
        ? "doc|dot|docx|odf|pdf|ps|ppt|rtf|xls"
        : "doc|dot|pdf|ps|ppt|rtf|xls";

    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR " or text using third-party programs.\n\n";
    print STDERR " usage: $0 [options] filename\n";
    print STDERR " options:\n\t-type\t$input_types\t(input file type)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows script (if available) when converting Microsoft Word and PPT via VB script\n";
    print STDERR "\t-openoffice_scripting\tuse openoffice script (if available) when converting Microsoft Word and PPT via OpenOffice\n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    # the closing parenthesis was missing from this message
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
98
# $faillogfile: file that converter error output is appended to
#               ("" means no fail log); filled from -errlog by main().
# $timeout:     CPU-seconds limit passed to "ulimit -t" (0 means none);
#               filled from -timeout by main().
my ($faillogfile, $timeout) = ("", 0);
101
# Entry point: parse the command line, work out the input type, change
# into the input file's directory and dispatch to the matching
# convertXXX routine.
#
# Arguments: the command-line argument list.
# Output:    prints the converter's result ("html", "text", "item" or
#            "fail") followed by a newline on STDOUT.
# Exits with status 1 (via print_usage or directly) on bad arguments,
# an unreadable input file, or an unknown/undeterminable input type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # scan for -openoffice_scripting as it affects the permissible
    # values for -type
    foreach my $arg (@args) {
        if ($arg =~ m/^-openoffice_scripting$/) {
            $openoffice_scripting = 1;
            last;
        }
    }

    my $parse_type;
    if ($openoffice_scripting) {
        $parse_type = 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/';
    }
    else {
        $parse_type = 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         $parse_type, \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimage).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'openoffice_scripting', \$openoffice_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension without
    # its leading dot. A file with no extension leaves the type empty
    # (and avoids calling substr past the end of an empty suffix).
    if (!defined($input_type) || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # The converters treat an empty output type as "try HTML, then
    # TEXT"; the literal string "auto" matches none of their html/text
    # tests, so map it onto the empty type to get that behaviour.
    if (defined($output_type) && $output_type eq "auto") {
        $output_type = "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}
213
# Run the converter on the script's command-line arguments
main(@ARGV);
215
216
217
# Document-type conversion functions
#
# The following functions attempt to convert documents from their
# input type to the specified output type. If no output type was
# given, then they first attempt HTML, and then TEXT.
#
# Each returns the output type ("html" or "text", or "item" when a
# PDF, PostScript or PowerPoint file was converted to a set of images)
# or "fail" if no conversion is possible.
226
# Convert a Microsoft Word document (or, with OpenOffice scripting, a
# .docx/.odf document).
#
# Arguments: input filename, output file stem (output path without
#            extension), requested output type (may be empty).
# Returns:   whatever the delegated converter returns; the OpenOffice
#            fast path returns "html" on success and "fail" otherwise.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # main() routes both docx and odf input here when OpenOffice
    # scripting is on, so accept both extensions (as well as .doc) for
    # the direct OpenOffice conversion.
    if (($openoffice_scripting) && ($input_filename =~ m/\.(docx?|odf)$/i)) {
        # Jump right in and process with Open Office
        if (openoffice_doc_to_html($input_filename, $output_filestem)) {
            return "html";
        }
        else {
            return "fail";
        }
    }

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    } elsif ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    } else {
        return &convertAnything($input_filename, $output_filestem, $output_type);
    }
}
253
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML is acceptable (no output type given, or one containing
# "html"), one HTML converter is tried: the Windows script, the
# OpenOffice script, or wvWare, in that order of preference according
# to the command-line switches. If that is skipped or fails, the
# generic convertAnything fallback decides the result.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = 0;
        if ($windows_scripting) {
            $converted = native_doc_to_html($input_filename, $output_filestem);
        }
        elsif ($openoffice_scripting) {
            $converted = openoffice_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $converted = doc_to_html($input_filename, $output_filestem);
        }
        return "html" if ($converted);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
276
277
# Convert a Rich Text Format (RTF) file.
#
# Only HTML output is attempted. Returns "html" on success and "fail"
# when HTML was not requested or the chosen converter failed.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # rtf is so ugly that it's not worth running strings over, so there
    # is deliberately no convertAnything fallback here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    return "fail" unless ($html_acceptable);

    # Attempt specialised conversion to HTML
    my $converted;
    if ($windows_scripting) {
        $converted = native_doc_to_html($input_filename, $output_filestem);
    }
    elsif ($openoffice_scripting) {
        $converted = openoffice_doc_to_html($input_filename, $output_filestem);
    }
    else {
        $converted = rtf_to_html($input_filename, $output_filestem);
    }

    return "html" if ($converted);
    return "fail";
}
307
308
# Convert an unidentified file.
#
# Tries the generic HTML converter and then the generic text converter,
# each only when the requested output type allows it (an empty output
# type allows both). Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
333
334
335
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page
# images, HTML, then text. Returns "item" for a successful image
# conversion, otherwise "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last "-" in the output type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html"
            if (pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if (pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
370
371
# Convert an Adobe PostScript document.
#
# Tries page images when an image format is requested, then text.
# Returns "item" for a successful image conversion, otherwise "text"
# or "fail". (HTML output is never attempted for PostScript.)
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last "-" in the output type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
396
397
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run and "item" is returned on success
# (or immediately, if the output directory already exists). Otherwise,
# when HTML is acceptable, ppttohtml.pl is run and "html" is returned
# on success. If neither path succeeds the generic text extractor is
# tried, giving "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto pptextract's option flag
        my $format_flag = "";
        if ($output_type =~ m/gif/i) {
            $format_flag = "-g";
        }
        elsif ($output_type =~ m/jp?g/i) {
            $format_flag = "-j";
        }
        elsif ($output_type =~ m/png/i) {
            $format_flag = "-p";
        }

        my $extractor = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                            $ENV{'GSDLOS'}, "pptextract");
        # on windows rely on the PATH rather than a full path name
        $extractor = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        # if the converting directory already exists, assume the work
        # has been done
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $extract_cmd = $timeout ? "ulimit -t $timeout;" : "";
        $extract_cmd .= "$extractor $format_flag \"$input_filename\" \"$output_filestem\"";
        $extract_cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        return "item" if (system($extract_cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $html_cmd = "perl -S ppttohtml.pl ";
        $html_cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $html_cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        return "html" if (system($html_cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    my $got_text = any_to_text($input_filename, $output_filestem);
    return "text" if ($got_text);

    return "fail";
}
459
460
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned
# on success. Otherwise (or on failure) the generic text extractor is
# tried, giving "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $html_cmd = "perl -S xlstohtml.pl ";
        $html_cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $html_cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        return "html" if (system($html_cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back to plain text extraction
    my $got_text = any_to_text($input_filename, $output_filestem);
    return "text" if ($got_text);

    return "fail";
}
493
494
495
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: path of the file to inspect.
# Returns:  "rtf" if the first line starts with "{\rtf",
#           "word6", "word7" or "word8" if a "Word.Document.N" marker
#           is found on any line,
#           "unknown" otherwise (including when the file cannot be
#           opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and an unopenable file is reported as
    # "unknown" instead of being read from a closed handle.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
531
532
533# Specific type-to-type conversions
534#
535# Each of the following functions attempts to convert a document from
536# a specific format to another. If they succeed they return 1 and leave
537# the output document(s) in the appropriate place; if they fail they
538# return 0 and delete any working files.
539
540
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename; output file stem (the HTML is written to
#            "<stem>.html", converter stderr to "<stem>.err").
# Returns:   1 if a non-empty HTML file whose first line contains
#            "DOCTYPE HTML" was produced, 0 otherwise. Messages from
#            the converter are echoed to STDERR and, when a fail log
#            was given with -errlog, appended to it.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # A bundled wv installation on linux needs its bin and lib
    # directories put on the search paths
    my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
    if ( -d $wv_home && $ENV{'GSDLOS'} eq "linux" ) {
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the previous value when there is one: this avoids
        # an "uninitialized value" warning and a trailing empty entry
        # in the library search path.
        if (defined($ENV{'LD_LIBRARY_PATH'}) && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        }
        else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    #
    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.
    #
    # To do all this, we use wvWare's --dir and --basename options,
    # where dir is the full path to the image folder and basename is
    # the full path to the image folder with the image name prefix
    # appended: eg. if the images were to have names like sample0.jpg
    # to sampleN.jpg, the basename is "/full/path/to/imgdir/sample".
    # HOWEVER: basename always takes a full path, not a relative url,
    # so the greenstone browser is unable to display the images
    # (absolute paths cause it to give an "external link" message);
    # the links are therefore made relative after conversion (below).
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix (including directory) given to every extracted image file
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                } # while errfile
                # the error file handle used to be left open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after
            # the html was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # copy the converter's error output into the fail log, if any
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print {$faillog} $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
696
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - top-level folder in which the html file resides
#   docname        - name (without extension) of the html file
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated
#                    images: docname_files
# Returns 1 on success, 0 if the html file could not be read or rewritten.
sub make_links_to_assocdir_relative {
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef;        # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);

    # 2. Replace *all* occurrences of the assoc_dir_path in a href and
    #    img src values with assoc_dirname.

    # wvWare puts %20 in place of space, so change our prefix to match,
    # then let quotemeta escape every character that is special in a
    # regular expression (., -, [, ], (, ), +, backslashes in Windows
    # paths, ...) rather than escaping a hand-picked few.
    (my $assoc_dir_in_html = $assoc_dir_path) =~ s/ /%20/g;
    my $assoc_dir_regex = quotemeta($assoc_dir_in_html);

    # Look for <a or <img, followed by anything up to the first href=
    # or src= that is followed (after an optional opening quote) by
    # the associated folder's pathname. The rest of the attribute
    # value (up to the closing quote or the end of the tag) is
    # captured too. The absolute folder path is replaced by the
    # relative folder name, and only that one link value is passed to
    # post_process_assocfile_urls, which turns %20 back into spaces,
    # backslashes into slashes and % into %25. Limiting the post
    # processing to the link value matters: applying it with a greedy
    # match would rewrite %, %20 and backslashes throughout the rest
    # of the document text.
    # s treats html_contents as a single line, g replaces every
    # occurrence, e evaluates the replacement as code.
    $html_contents =~ s{(<(?:a|img).*?(?:href|src)=(?:\"|\')?)$assoc_dir_regex([^\"\'>]*)}
                       {&post_process_assocfile_urls($1, $assoc_dirname.$2, "")}sge;

    # delete the original file and recreate it
    my $copy_of_filename = $html_file;
    &util::rm($copy_of_filename);

    # Recreate the original file for writing the updated contents
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents; buffered write errors only show
    # up when the handle is closed, so check that too
    print {$fout} $html_contents;
    unless (close($fout)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    return 1;
}
782
# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
# introduced in link pathnames by wvWare into space again. Any percent sign still
# present after that is then written as %25.
#
# Arguments: the text before the link, the link text to clean up, the text after it.
# Returns:   the three pieces joined back together, with only the middle one altered.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    # The order matters: %20 must become a space before the remaining
    # percent signs are turned into %25.
    for ($text) {
        s/%20/ /g;    # spaces, not underscores: underscores mess with incremental rebuild
        tr{\\}{/};    # windows backslashes to url slashes
        s/%/%25/g;
    }

    return $pre . $text . $post;
}
798
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: input filename; output file stem (the HTML is written to
#            "<stem>.html", converter stderr to "<stem>.err").
# Returns:   1 if "<stem>.html" already exists, or if the converter
#            produced a non-empty file whose first line contains
#            "html"; 0 otherwise. Messages from the converter are
#            echoed to STDERR and, when a fail log was given with
#            -errlog, appended to it.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows rely on the PATH rather than a full path name
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                } # while errfile
                # the error file handle used to be left open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # copy the converter's error output into the fail log, if any
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {print {$faillog} $err_line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
877
# Attempt to convert a word document to html with the 'oo2html' script
# (JODConverter scripting program) from the OpenOffice extension.
#
# Arguments:
#   $input_filename  - path of the document to convert
#   $output_filestem - path (without extension) of the files to generate
#
# Returns 1 if "$output_filestem.html" already exists, or if it was
# generated and its first line mentions "html".  Returns 0 otherwise, in
# which case the caller can fall back to any_to_text.  Converter errors
# are echoed to STDERR and appended to $faillogfile when one is set.
sub openoffice_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... skipping\n";
        return 1;
    }

    # Without the extension's home directory there is nothing to look for;
    # bail out early rather than building a path from an undefined value.
    my $oo_ext_home = $ENV{'GEXT_OPENOFFICE'};
    if (!defined($oo_ext_home) || $oo_ext_home eq "") {
        print STDERR "Error: GEXT_OPENOFFICE is not set.\n";
        print STDERR " Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    my $oo_script_dir = &util::filename_cat($oo_ext_home, "bin", "script");
    my $oo2html = &util::filename_cat($oo_script_dir, "oo2html");
    if (!-e $oo2html) {
        print STDERR "Error: Unable to find 'oo2html' in: \n";
        print STDERR " $oo_script_dir\n";
        print STDERR " Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$oo2html \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing oo2html converter: $!\n";
        print STDERR "Command was: $cmd\n";

        if (-s "$output_filestem.err") {
            my $fail_log;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($fail_log, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # Echo the converter's error output; the handle is now closed
            # again once we are done with it.
            if (open(my $err_file, "<", "$output_filestem.err")) {
                while (defined(my $line = <$err_file>)) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$fail_log} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$fail_log} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($err_file);
            }
            close($fail_log) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_file, "<", "$output_filestem.html")) {
            $first_line = <$html_file>;
            close($html_file);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $fail_log;
        if ($faillogfile ne "" && open($fail_log, ">>", $faillogfile)) {
            if (open(my $err_log, "<", "$output_filestem.err")) {
                while (defined(my $line = <$err_log>)) {
                    print {$fail_log} $line;
                }
                close($err_log);
            }
            close($fail_log);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
962
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments:
#   $input_filename  - path of the RTF document
#   $output_filestem - path (without extension) of the files to generate
#
# Returns 1 when "$output_filestem.html" was produced and contains some
# word characters after its <body> line; a generated
# "${output_filestem}_ToC.html" is merged into the html when present.
# Returns 0 otherwise, after appending the converter's error output to
# $faillogfile (when set) and removing the partial output.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    my $html_in;
    if (-s "$output_filestem.html"
        && open($html_in, "<", "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (defined(my $line = <$html_in>)) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # NB: the open() results are tested directly.  Writing
            # 'open FH, "file" || ...' would attach the || to the filename
            # string, so a failed open would go unnoticed.
            my ($html_src, $toc, $html_out);
            my $src_ok = open($html_src, "<", "$output_filestem.src");
            my $toc_ok = open($toc, "<", "${output_filestem}_ToC.html");
            # only create the output file once both inputs are readable
            my $out_ok = ($src_ok && $toc_ok)
                ? open($html_out, ">", "$output_filestem.html") : 0;

            if (!($src_ok && $toc_ok && $out_ok)) {
                close($html_src) if ($src_ok);
                close($toc) if ($toc_ok);
                # put the unmodified html back and carry on without a ToC
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without word characters (that line is dropped).
            while (defined(my $line = <$html_src>)) {
                last if ($line !~ m/\w/);
                print {$html_out} "$line";
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;  # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>;  # line 3 = "<ol>\n"
            print {$html_out} $list_start if (defined($list_start));
            while (defined(my $line = <$toc>)) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (defined(my $line = <$html_src>)) {
                print {$html_out} $line;
            }
            close($html_src);
            close($html_out);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $fail_log;
        if ($faillogfile ne "" && open($fail_log, ">>", $faillogfile)) {
            print {$fail_log} "Error - rtftohtml - couldn't extract text\n";
            #print {$fail_log} "Error - rtf-converter - couldn't extract text\n";
            print {$fail_log} " (rtf file might be too recent):\n";
            if (open(my $err_log, "<", "$output_filestem.err")) {
                while (defined(my $line = <$err_log>)) {
                    print {$fail_log} $line;
                }
                close($err_log);
            }
            close($fail_log);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
1079
1080
# Convert a pdf file to html by running the pdftohtml.pl script.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF file
#   $output_filestem - path (without extension) of the files to generate
#
# Returns 1 when the script exits with 0 and leaves a non-empty
# "$output_filestem.html"; otherwise reports the script's error output on
# STDERR, appends it to $faillogfile (when set), cleans up and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # assemble the command line piece by piece
    my @cmd_parts = ("perl -S pdftohtml.pl -zoom $pdf_zoom");
    push(@cmd_parts, "-c") if ($pdf_complex);
    push(@cmd_parts, "-i") if ($pdf_ignore_images);
    push(@cmd_parts, "-a") if ($pdf_allow_images_only);
    push(@cmd_parts, "-hidden") unless ($pdf_nohidden);
    push(@cmd_parts, "\"$input_filename\" \"$output_filestem\"");

    # only Windows versions other than NT/2000 lack the "2>" redirection
    my $plain_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000);
    push(@cmd_parts, $plain_windows
         ? "> \"$err_file\""
         : "> \"$out_file\" 2> \"$err_file\"");

    my $cmd = ($timeout ? "ulimit -t $timeout;" : "") . join(" ", @cmd_parts);

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter ran cleanly and made something: tidy up and finish
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }

    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
1142
# Convert a pdf or postscript file to various types of image by running
# the pdfpstoimg.pl script.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF/PS file
#   $output_filestem - path (without extension) handed to pdfpstoimg.pl
#   $output_type     - requested output type; only the text after its last
#                      underscore is passed on as -convert_to
#
# Returns 1 when pdfpstoimg.pl exits with 0.  Returns 0 when ImageMagick's
# 'identify' cannot be run or when the script fails; in the latter case the
# script's error output is echoed to STDERR and appended to $faillogfile
# (when set).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # run only for its exit status; the output itself is not needed
        my $result = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.*\_(.*)/$1/i;
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        #print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_log, "<", "$output_filestem.err") or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (defined(my $line = <$err_log>)) {
                print STDERR "$line";
            }
            close($err_log);
        }
        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            my $fail_log;
            if ($faillogfile ne "" && open($fail_log, ">>", $faillogfile)) {
                if (open(my $err_log, "<", "$output_filestem.err")) {
                    while (defined(my $line = <$err_log>)) {
                        print {$fail_log} $line;
                    }
                    close($err_log);
                }
                close($fail_log);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1211
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF file
#   $output_filestem - path (without extension) of the files to generate
#
# Returns 1 when "$output_filestem.text" was produced and contains at
# least one word character.  Otherwise echoes pdftotext's error output to
# STDERR, appends it to $faillogfile (when set), removes the temporary
# files and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # reset $! so the message below cannot report a stale error
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extr_text, "<", "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (!$seen_text && defined(my $line = <$extr_text>)) {
                $seen_text = 1 if ($line =~ m/\w/);
            }
            close($extr_text);
        } else {
            # an unreadable file is treated the same as one with no text
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_log, "<", "$output_filestem.err") or die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $line = <$err_log>)) {
                print STDERR "$line";
            }
            close($err_log);
        }
        # does this converter create a .out file?
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            my $fail_log;
            if ($faillogfile ne "" && open($fail_log, ">>", $faillogfile)) {
                if (open(my $err_log, "<", "$output_filestem.err")) {
                    while (defined(my $line = <$err_log>)) {
                        print {$fail_log} $line;
                    }
                    close($err_log);
                }
                close($fail_log);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    # success: remove both redirect targets (the stdout capture file is
    # created by the shell on non-Windows systems even when it stays empty)
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1277
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - path (without extension) of the files to generate
#
# On non-Windows systems gs is tried first.  If gs is not attempted, fails,
# or reports an interpreter error, the text is stripped out of the
# PostScript source with a series of regular expressions instead.  The
# resulting "$output_filestem.text" is then re-wrapped at the first
# whitespace after 72 characters.
#
# Returns 1 on success; returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with '%!'.  Dies if
# the intermediate files for the wrapping step cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the error file too, in case the path contains spaces
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system for this...
        # Test the whole of $? rather than just the exit code ($? >> 8), so
        # that a gs killed by a signal also counts as a failure.
        # if system returns -1 | 127 (couldn't start program), look at $! for message
        if ($? != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $ps_out, "<", "$output_filestem.text")) {
                my $first_line = <$ps_out>;
                # an empty output file has no first line to inspect
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($ps_out);
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        my $fail_log;
        if ("$faillogfile" ne "" && open($fail_log, ">>", $faillogfile)) {
            print {$fail_log} "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $err_log, "<", "$output_filestem.err")) {
                while (defined(my $line = <$err_log>)) {
                    print {$fail_log} $line;
                }
                close($err_log);
            }
            close($fail_log);
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $ps_in;
        if (!open($ps_in, "<", $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close($ps_in);

        # Make sure this is a ps file...  (checked before the output file
        # is created, so a rejected input leaves no empty .text behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            my $header_log;
            if ($faillogfile ne "" && open($header_log, ">>", $faillogfile)) {
                print {$header_log} "Bad postscript header: not '%!'\n";
                close($header_log);
            }
            return 0;
        }

        my $text_out;
        if (!open($text_out, ">", "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print {$text_out} "$text";
        close($text_out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # 'or' (not '||') so that the die is tied to open() and not to the
    # filename string
    open(my $unwrapped, "<", "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $wrapped, ">", "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$unwrapped>)) {
        while (length($line) > 0) {
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrapped} "$1\n";
            } else {
                # short enough (or not splittable): emit the remainder,
                # which also guarantees the loop terminates
                print {$wrapped} "$line";
                $line = "";
            }
        }
    }
    close($unwrapped);
    close($wrapped);
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1447
1448
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - path (without extension) of the files to generate
#
# Runs any_to_text first, then wraps each line of the resulting text file
# in minimal HTML: blank lines become <p>, other lines are prefixed with
# <br>.  The intermediate text file is removed afterwards.
#
# Returns 1 on success; 0 if any_to_text fails or if either the text file
# or the html file cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, "<", "$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
        return 0;
    }
    if (!open($html_fh, ">", "$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close($text_fh);
        # don't leave the intermediate file lying around
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # escape '&' first, so that the entities introduced below are not
        # themselves re-escaped and literal "&lt;"-like text stays literal
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close($html_fh);
    close($text_fh);

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1485
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan
#   $output_filestem - path (without extension); output is written to
#                      "$output_filestem.text"
#
# Returns 0 straight away unless $use_strings is set, or if either file
# cannot be opened.  Returns 1 if at least one chunk of text was written;
# otherwise removes the output file and returns 0.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless ($use_strings);

    print STDERR "\n**** In any to text****\n\n";
    open(IN, "<$input_filename") || return 0;
    binmode(IN);
    open(OUT, ">$output_filestem.text") || return 0;

    my $output_line_count = 0;
    my $chunk;
    while (<IN>) {
        $chunk = $_;

        # every run of non-printable characters becomes a line break
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # blank out strings shorter than 10 characters, repeating until
        # no short string is left
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # squeeze runs of newlines and drop a leading one
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # nothing but newlines and spaces left?  then skip this chunk
        next unless ($chunk =~ m/[^\n ]/);

        print OUT $chunk;
        ++$output_line_count;
    }

    close OUT;
    close IN;

    # try to protect against binary only formats
    return 1 if ($output_line_count);

    &util::rm("$output_filestem.text");
    return 0;
}
Note: See TracBrowser for help on using the repository browser.