source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 22568

Last change on this file since 22568 was 22568, checked in by kjdon, 14 years ago

changed part of the usage message

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 48.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999-2002 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. The sources of these are usually found
31# in the $GSDLHOME/packages directory, and the executables should live in
32# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33#
34# Currently, we can convert the following formats by using external
35# conversion utilities:
36# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38#
39# We can try to convert any file to text with a perl implementation of the
40# UNIX strings command.
41#
42# We try to convert Postscript files to text using "gs" which is often on
43# *nix machines. We fall back to performing weak text extraction by using
44# regular expressions.
45
# Compile-time setup: refuse to run without GSDLHOME, and put the Greenstone
# perllib directory at the front of the module search path so the project
# modules loaded below can be found.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    unshift @INC, "$gsdl_home/perllib";
}
50
51use strict;
52
53use parsargv;
54use util;
55use Cwd;
56use File::Basename;
57
58# Are we running on WinNT or Win2000 (or later)?
# Win32::IsWinNT() is only available when the Win32 module can be loaded;
# if the eval fails (module missing) the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT() };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; they are filled in by main() and read by the
# conversion routines further down the file.
my (
    $use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting,
    $openoffice_scripting,
);
70
# Print the command-line synopsis to STDERR and terminate the program with
# exit status 1 (this routine never returns).
# The list of values shown for -type depends on $openoffice_scripting:
# docx and odf are only offered when OpenOffice scripting is enabled.
sub print_usage
{
    my $input_types = $openoffice_scripting
        ? "doc|dot|docx|odf|pdf|ps|ppt|rtf|xls"
        : "doc|dot|pdf|ps|ppt|rtf|xls";

    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR " or text using third-party programs.\n\n";
    print STDERR " usage: $0 [options] filename\n";
    print STDERR " options:\n\t-type\t$input_types\t(input file type)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
    print STDERR "\t-openoffice_scripting\tuse OpenOffice (if available) to convert Microsoft Office documents \n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    # The closing parenthesis was missing from this message.
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
98
# -errlog target (empty string means "no fail log") and -timeout value in
# CPU seconds (0 means "no limit"); both are overwritten by main().
my ($faillogfile, $timeout) = ("", 0);
101
# Entry point: parse the command line, work out the input type and the
# output file stem, change into the document's directory, run the matching
# convert* routine and print its result ("html", "text", "item" or "fail")
# followed by a newline on STDOUT.
#
# Arguments: the command-line argument list.
# Exits with status 1 (via print_usage or directly) on bad usage, an
# unreadable input file, or an input type that cannot be converted.
sub main
{
    # Work on a private copy rather than a lexical that shadows the global @ARGV.
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Scan for -openoffice_scripting first, as it affects the permissible
    # values for -type.
    foreach my $arg (@args) {
        if ($arg =~ m/^-openoffice_scripting$/) {
            $openoffice_scripting = 1;
            last;
        }
    }

    my $parse_type = $openoffice_scripting
        ? 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/'
        : 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         $parse_type, \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimage).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'openoffice_scripting',\$openoffice_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain.
    # (The original tested scalar(@ARGV!=1), which only worked by accident.)
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the document's directory below, so a relative path such
    # as "sub/file.doc" would no longer resolve afterwards. Make it absolute
    # before deriving the directory and the output file stem.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the lower-cased filename extension, or
    # leave the type undefined when the file has no extension at all.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($openoffice_scripting && (($input_type eq "docx") || ($input_type eq "odf"))) {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT.
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}
213
# Run the converter on the real command line.
main(@ARGV);
215
216
217
218# Document-type conversion functions
219#
220# The following functions attempt to convert documents from their
221# input type to the specified output type. If no output type was
222# given, then they first attempt HTML, and then TEXT.
223#
224# Each returns the output type ("html", "text", or "item" for image
225# output) or "fail" if no conversion is possible.
226
227# Convert a Microsoft word document
228
# Convert a file that claims to be a Microsoft Word document.
# Arguments: input filename, output file stem, requested output type.
# Returns whatever the selected converter returns, or "html"/"fail" when
# the OpenOffice shortcut is taken.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # With OpenOffice scripting, .doc/.docx files skip the type sniffing
    # and go straight to OpenOffice.
    # NOTE(review): main() also sends "odf" input here, but an .odf name
    # does not match this pattern -- confirm whether that is intended.
    if ($openoffice_scripting && $input_filename =~ m/\.docx?$/i) {
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    # Many .doc files are not in fact word documents, so look inside.
    my $detected = find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    if ($detected =~ m/\Aword[678]\z/) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    return convertAnything($input_filename, $output_filestem, $output_type);
}
253
254# Convert a Microsoft word 6/7/8 document
255
# Convert a Word 6/7/8 document.
# HTML is attempted first (when no output type was requested or it mentions
# html) using the converter selected by the scripting switches; if that
# does not succeed the generic convertAnything fallback is used.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_html = (!$output_type || ($output_type =~ m/html/i));
    if ($wants_html) {
        # -windows_scripting takes precedence over -openoffice_scripting;
        # wvWare (doc_to_html) is the default.
        my $to_html = $windows_scripting    ? \&native_doc_to_html
                    : $openoffice_scripting ? \&openoffice_doc_to_html
                    :                         \&doc_to_html;
        my $converted = $to_html->($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
276
277
278# Convert a Rich Text Format (RTF) file
279
# Convert a Rich Text Format file.
# Only HTML output is attempted; returns "html" on success and "fail"
# otherwise (there is deliberately no text fallback, see below).
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    my $wants_html = (!$output_type || ($output_type =~ m/html/i));
    if ($wants_html) {
        my $to_html = $windows_scripting    ? \&native_doc_to_html
                    : $openoffice_scripting ? \&openoffice_doc_to_html
                    :                         \&rtf_to_html;
        my $converted = $to_html->($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # RTF markup is too noisy for running strings over it to be worthwhile,
    # so convertAnything is not used as a fallback here.  (Original note by
    # jrm21: some quick'n'dirty regexps to extract text may be added one day.)
    return "fail";
}
307
308
309# Convert an unidentified file
310
# Convert a file of unidentified type.
# Tries any_to_html when HTML is acceptable, then any_to_text when text is
# acceptable; an empty/undefined output type accepts both.
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        my $converted = any_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        my $converted = any_to_text($input_filename, $output_filestem);
        return "text" if ($converted);
    }

    return "fail";
}
333
334
335
336# Convert an Adobe PDF document
337
# Convert an Adobe PDF document.
# Arguments: directory of the document, input filename, output file stem,
# requested output type.
# Tries, in order and depending on the output type: page images, HTML, text.
# Returns "item" (images), "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last hyphen of the output type, if any.
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $converted = pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($converted);
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $converted = pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if ($converted);
    }

    return "fail";
}
370
371
372# Convert an Adobe PostScript document
373
# Convert an Adobe PostScript document.
# Tries page images when the output type names an image format, then text.
# Returns "item" (images), "text" or "fail"; there is no HTML route.
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # Keep only what follows the last hyphen of the output type, if any.
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $converted = pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($converted);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $converted = ps_to_text($input_filename, $output_filestem);
        return "text" if ($converted);
    }

    return "fail";
}
396
397
# Convert a Microsoft PowerPoint document.
# Routes, in order:
#   - OpenOffice scripting: HTML via openoffice_doc_to_html ("html"/"fail").
#   - Windows scripting with a non-html, non-text output type: the
#     pptextract script, producing a directory of slides ("item").
#   - Otherwise, when HTML is acceptable: ppttohtml.pl ("html").
#   - Finally any_to_text ("text"), else "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($openoffice_scripting) {
        # Jump right in and process with Open Office
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    # STDERR is only redirected where that is known to be safe.
    my $can_redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Flag telling pptextract which image format to produce
        # (left empty when the output type names none of them).
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        }
        elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        }
        elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($can_redirect_stderr) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($can_redirect_stderr) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction.
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
470
471
# Convert a Microsoft Excel document.
# With OpenOffice scripting the result is "html" or "fail". Otherwise
# xlstohtml.pl is tried when HTML is acceptable, then any_to_text.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($openoffice_scripting) {
        # Jump right in and process with Open Office
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # STDERR is only redirected where that is known to be safe.
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back to plain text extraction.
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
514
515
516
517# Find the real type of a .doc file
518#
519# We seem to have a lot of files with a .doc extension that are .rtf
520# files or Word 5 files. This function attempts to tell the difference.
# Sniff the real type of a file carrying a .doc extension.
#
# Argument: the name of the file to inspect.
# Returns:
#   "rtf"      when the first line starts with "{\rtf";
#   "word6", "word7" or "word8" when a line contains
#              "Word.Document.6", ".7" or ".8" respectively;
#   "unknown"  otherwise, including when the file cannot be opened
#              (a message is then printed to STDERR).
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and failure is reported instead of ignored.
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "gsConvert.pl: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            $first = 0;
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # Single exit point, so the handle is closed on every path
    # (the original leaked it when returning "unknown").
    close($chk);

    return $realtype;
}
552
553
554# Specific type-to-type conversions
555#
556# Each of the following functions attempts to convert a document from
557# a specific format to another. If they succeed they return 1 and leave
558# the output document(s) in the appropriate place; if they fail they
559# return 0 and delete any working files.
560
561
562# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wvWare program.
#
# Arguments: input filename, output file stem (the html is written to
# "<stem>.html", wvWare's STDERR to "<stem>.err" where redirection is safe).
# Returns 1 when "<stem>.html" was produced and its first line contains
# "DOCTYPE HTML"; 0 otherwise. Error output is echoed to STDERR and
# appended to the -errlog file when one was given.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux, prefer a wv installation bundled under bin/<os>/wv.
    my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
    if ( -d $wv_home && $ENV{'GSDLOS'} eq "linux" ) {
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the previous value when there is one: a trailing ":"
        # is an empty entry, which the dynamic loader treats as the current
        # directory (and interpolating undef also raised a warning).
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        }
        else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                     "packages", "wv", "wvHtml.xml");

    # Added to work with replace_srcdoc_with_html.pl, which moves the html
    # and its associated files into the import folder: wvWare is told to put
    # any associated (image) files into a folder <docname>_files next to the
    # generated html, so they cannot overwrite similarly named items already
    # in import.
    #
    # This uses wvWare's --dir and --basename options, where dir is the full
    # path to the image folder and basename is that path plus the name
    # prefix given to every image file (e.g. "/full/path/to/imgdir/sample"
    # for images sample0.jpg .. sampleN.jpg). Here the prefix is the
    # document name.
    # HOWEVER: basename always takes a full path, not a relative url, so
    # the greenstone browser cannot display the images (absolute paths make
    # it give an "external link" message); the links are therefore made
    # relative again after a successful conversion (see below).
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        util::rm_r($assoc_dir);
    }
    util::mk_dir($assoc_dir);

    # prefix (full path) for the image files wvWare generates
    my $img_basenames = util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:$!\n";
        my $errfile;
        if (-s $err_file && open($errfile, '<', $err_file)) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($faillog) if ($write_to_fail_log);
            # The original never closed the error file handle.
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, '<', $html_file)) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            util::rm($err_file) if -e $err_file;

            # Remove the images directory if it is still empty after the
            # html was generated (no images in the word document).
            if (util::is_dir_empty($assoc_dir)) {
                util::rm_r($assoc_dir);
            }
            else {
                # The generated html contains absolute links to the images.
                # Replace them with relative links instead, so the folder
                # can be moved elsewhere.
                make_links_to_assocdir_relative($toppath, $docname, $html_file, $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            # Copy the converter's error output into the fail log.
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm($err_file);
    }

    return 0;
}
717
718# Method to work with doc_to_html - Word docs might contain images.
719# When such word docs are converted with wvWare, we make it generate a
720# <filename>_files folder with the associated images, while the html file
721# <filename> refers to the images using absolute paths to <filename>_files.
722# This method reads in that html file and replaces all the absolute paths to
723# the images in <filename>_files with the relative paths to the images from
724# that folder. (I.e. with <filename>_files/<imagename.ext>).
# Rewrite, in place, the html file generated by wvWare so that links into
# the associated-files folder are relative instead of absolute.
#
# Arguments:
#   toppath        - top-level folder in which the html file resides
#   docname        - name (without extension) of the html file
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - name of the folder with associated images: docname_files
# (toppath and docname are accepted for interface compatibility; they are
# not needed by the rewriting itself.)
#
# Returns 1 on success, 0 if the html file cannot be read or written.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef; # Read entire file at once
        $html_contents = <$fin>;
    }
    close($fin);
    $html_contents = "" unless defined $html_contents; # empty file

    # 2. Build a pattern matching the absolute folder path as it appears in
    # the html. wvWare puts %20 in place of spaces, so do the same to the
    # prefix, then let quotemeta escape *every* regex metacharacter (the
    # previous hand-written escaping only covered \ . - [ ] and broke on
    # names containing e.g. parentheses or +).
    my $assoc_path_in_html = $assoc_dir_path;
    $assoc_path_in_html =~ s/ /%20/g;
    my $assoc_path_re = quotemeta($assoc_path_in_html);

    # 3. For every href/src attribute of an <a> or <img> tag whose value
    # starts with the absolute folder path, replace that path with the
    # folder name and tidy the remainder of the url. Only such attribute
    # values are touched; the previous second pass used a greedy match that
    # rewrote %20, backslashes and percent signs throughout the rest of the
    # document.
    # Alternatives below: double-quoted, single-quoted, unquoted value.
    $html_contents =~ s{
        ( < (?i: a | img ) \b [^>]*? \b (?i: href | src ) = )
        (?:
            (") $assoc_path_re ( [^"]* )
          | (') $assoc_path_re ( [^']* )
          |     $assoc_path_re ( [^\s>]* )
        )
    }{
        # Copy the captures first: the substitutions below reset them.
        my $tag_prefix = $1;
        my $quote      = defined $2 ? $2 : defined $4 ? $4 : "";
        my $url_rest   = defined $3 ? $3 : defined $5 ? $5 : $6;

        # Same rewrites as post_process_assocfile_urls applies:
        $url_rest =~ s/%20/ /g;   # %20 back to a literal space
        $url_rest =~ s/\\/\//g;   # windows backslashes to url slashes
        $url_rest =~ s/%/%25/g;   # remaining percent signs are url-encoded

        $tag_prefix . $quote . $assoc_dirname . $url_rest;
    }sgxe;

    # 4. Write the updated contents back. Opening for writing truncates the
    # file, so it is not deleted first: if the open fails the original html
    # is left untouched.
    my $fout;
    unless (open($fout, '>', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    print {$fout} $html_contents;
    close($fout);
    return 1;
}
803
804# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
805# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
806# introduced in link pathnames by wvWare into space again. Converts all percent signs
807# introduced by URL encoding filenames generated into %25 in these url links referencing them
# Tidy the url part of a link and glue it back between the text that
# preceded and followed it.
# Arguments: leading text, url text, trailing text.
# In the url text only: every %20 becomes a space (not an underscore, since
# underscores mess with incremental rebuild), every backslash becomes a
# forward slash, and every percent sign still present becomes %25.
# Returns the three parts concatenated.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;     # must run before the percent rewrite below
        tr{\\}{/};     # windows-style separators to url slashes
        s/%/%25/g;
    }

    return join("", $pre, $text, $post);
}
819
820# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with the word2html scripting
# program.
#
# Arguments: input filename, output file stem (the html is written to
# "<stem>.html", the script's STDERR to "<stem>.err" where redirection is
# safe).
# Returns 1 when "<stem>.html" already exists (conversion is skipped) or
# was produced with "html" (any case) in its first line; 0 otherwise.
# Error output is echoed to STDERR and appended to the -errlog file when
# one was given.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                      $ENV{'GSDLOS'}, "word2html");

    # On windows the bare script name is used instead of the full path.
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        my $errfile;
        if (-s $err_file && open($errfile, '<', $err_file)) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($faillog) if ($write_to_fail_log);
            # The original never closed the error file handle.
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $line;
        if (open(my $tmp, '<', $html_file)) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            # Copy the script's error output into the fail log.
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm($err_file);
    }
    return 0;
}
898
# Attempt to convert a word-processor document to HTML by running the
# 'oo2html' script found under $ENV{'GEXT_OPENOFFICE'}/bin/script.
#
# Arguments:
#   $input_filename  - path of the document to convert
#   $output_filestem - output path without extension; the converter is asked
#                      to write "$output_filestem.html", and its STDERR is
#                      captured in "$output_filestem.err" where the platform
#                      allows redirection
#
# Returns 1 if "$output_filestem.html" already exists or was created and its
# first line mentions "html"; returns 0 otherwise so that the caller can try
# another converter.
#
# NOTE(review): $timeout, $faillogfile and $is_winnt_2000 are not declared in
# this sub - presumably file-level settings; confirm against the top of file.
sub openoffice_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # Never overwrite an existing conversion result.
    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... skipping\n";
        return 1;
    }

    my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script");
    my $oo2html = &util::filename_cat($oo_script_dir, "oo2html");
    if (!-e $oo2html) {
        print STDERR "Error: Unable to find 'oo2html' in: \n";
        print STDERR " $oo_script_dir\n";
        print STDERR " Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$oo2html \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing oo2html converter: $!\n";
        print STDERR "Command was: $cmd\n";

        if (-s $err_file) {
            # The fail log is optional: only written when a file name was
            # configured and it could be opened for appending.
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            # Echo the converter's error output.  The handle is lexical and
            # explicitly closed (the previous bareword handle was left open).
            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close $errfile;
            }
            close $faillog if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?  Require a non-empty file whose first
    # line contains "html" (case-insensitive).
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close $tmp;
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred: discard the partial output
    # and move the converter's error output into the fail log.
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }
    return 0;
}
983
# Attempt to convert an RTF document to HTML with the external 'rtftohtml'
# program.
#
# Arguments:
#   $input_filename  - path of the RTF document
#   $output_filestem - output path without extension; the converter is asked
#                      to write "$output_filestem.html"
#
# The conversion counts as successful only if the generated HTML contains at
# least one word character after the line holding "<body>".  If the converter
# also produced "${output_filestem}_ToC.html", that table of contents is
# spliced into the HTML just after the header, with its links made relative.
#
# Returns 1 on success, 0 on failure (the error output is then appended to
# the fail log, when one is configured, and the partial files are removed).
#
# NOTE(review): $timeout, $faillogfile and $is_winnt_2000 are not declared in
# this sub - presumably file-level settings; confirm against the top of file.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";
    my $src_file  = "$output_filestem.src";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if ((-s $html_file) && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                # the line containing <body> itself is not inspected
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;      # strip tags before looking for text
            if ($line =~ m/\w/) {       # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close $html_in;
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Open both inputs first and only create the new output once they
            # are readable.  (The earlier `open X, "file" || ++$open_failed`
            # form bound `||` to the file name, so failures went unnoticed.)
            my ($htmlsrc, $toc, $html_out);
            my $src_ok  = open($htmlsrc, '<', $src_file);
            my $toc_ok  = open($toc, '<', $toc_file);
            my $html_ok = ($src_ok && $toc_ok)
                ? open($html_out, '>', $html_file)
                : 0;

            if (!($src_ok && $toc_ok && $html_ok)) {
                # give up on the ToC and restore the plain conversion
                close $htmlsrc if ($src_ok);
                close $toc     if ($toc_ok);
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but not
            # including) the first line without a word character.
            while (defined(my $line = <$htmlsrc>)) {
                last if ($line !~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            my $toc_line_no = 0;
            while (my $line = <$toc>) {
                ++$toc_line_no;
                next if ($toc_line_no <= 2);    # ignore first 2 lines
                if ($toc_line_no > 3) {         # line 3 = "<ol>\n", copied as is
                    $line =~ s@</body></html>$@@i ; # only last line has this
                    # make link relative
                    $line =~ s@href=\"[^\#]+@href=\"@i;
                }
                print {$html_out} $line;
            }
            close $toc;

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close $htmlsrc;
            close $html_out;

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Failure: record the converter's error output, then clean up.
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
1100
1101
# Convert a pdf file to html by running the pdftohtml.pl wrapper script.
#
# Arguments:
#   $dirname         - not used by this sub
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension, handed to pdftohtml.pl
#
# Returns 1 when the script exits with status 0 and a non-empty
# "$output_filestem.html" exists; otherwise reports the captured error
# output, removes the partial files and returns 0.
#
# NOTE(review): $timeout, $pdf_zoom, $pdf_complex, $pdf_ignore_images,
# $pdf_allow_images_only, $pdf_nohidden, $faillogfile and $is_winnt_2000 are
# not declared in this sub - presumably file-level options; confirm.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $can_redirect_stderr =
        ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           {$cmd .= " -c";}
    if ($pdf_ignore_images)     {$cmd .= " -i";}
    if ($pdf_allow_images_only) {$cmd .= " -a";}
    if (!$pdf_nohidden)         {$cmd .= " -hidden";}
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Where STDERR cannot be redirected separately, STDOUT is captured in
    # the .err file instead.
    if ($can_redirect_stderr) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Success: the converter ran cleanly and made something.
    if ($retval == 0 && -s "$output_filestem.html") {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # Failure from here on.
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open (ERRLOG, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<ERRLOG>) {
            print STDERR "$_";
        }
        close ERRLOG;
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    # append the error output to the fail log (when configured), then drop it
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, "$output_filestem.err");
            while (<ERRLOG>) {print FAILLOG $_;}
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
1164
# Convert a PDF or PostScript file to images by running the pdfpstoimg.pl
# wrapper script.
#
# Arguments:
#   $dirname         - not used by this sub
#   $input_filename  - path of the PDF/PS file
#   $output_filestem - output path without extension, handed to pdfpstoimg.pl
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed to the script as -convert_to
#
# Returns 1 if the script exits with status 0, otherwise 0.  Also returns 0
# up front when the ImageMagick 'identify' probe reports "program not found".
#
# NOTE(review): $timeout, $faillogfile and $is_winnt_2000 are not declared in
# this sub - presumably file-level settings; confirm against the top of file.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # the output itself is discarded; only the exit status in $? matters
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # keep only what follows the last underscore of the requested type
    $output_type =~ s/.*\_(.*)/$1/i;
    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # STDERR cannot be redirected separately here, so capture STDOUT
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run (message used to say pdftoimg.pl)
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm($out_file) if (-e $out_file);

        #print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }
        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        # append the error output to the fail log (when configured), then drop it
        if (-e $err_file) {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1233
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - not used by this sub
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Returns 1 if a non-empty text file containing at least one word character
# was produced; otherwise reports the captured error output, removes the
# partial files and returns 0.
#
# NOTE(review): $faillogfile is not declared in this sub - presumably a
# file-level setting; confirm against the top of file.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            # defined() so that a final line such as "0" is still examined
            while (!$seen_text && defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {$seen_text = 1;}
            }
            close $extr_text;
        }
        else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }
        # does this converter create a .out file?
        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    # the shell redirection creates the .out file on non-Windows systems, so
    # remove it on success too rather than leaving it behind
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1299
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Strategy: run gs with ps2ascii.ps (skipped on Windows).  If that fails, or
# the output starts with "Error: ", fall back to a crude regexp-based text
# extraction from the raw PostScript.  Either way the resulting text is then
# re-wrapped at the first whitespace after 72 characters.
#
# Returns 1 on success and 0 if no text file could be produced.
#
# NOTE(review): $timeout and $faillogfile are not declared in this sub -
# presumably file-level settings; confirm against the top of file.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path, so file stems with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # system() returns the raw wait status (or -1 if the program could
        # not be started).  Test the whole status rather than only the exit
        # code in the high byte, so termination by a signal counts as failure.
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                close $psout;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        my $faillog;
        if ("$faillogfile" ne "" && open($faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if ((-e $err_file) && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in_ok = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" if (!$in_ok);
        my $out_ok = open(my $out, '>', $text_file);
        warn "Couldn't write file: $!" if (!$out_ok);
        if (!$in_ok || !$out_ok) {
            # release whatever was opened and don't leave an empty output
            close $in if ($in_ok);
            if ($out_ok) {
                close $out;
                &util::rm($text_file);
            }
            print STDERR "errors\n";
            return 0;
        }

        my $text = ""; # this is for whole .ps file...
        $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            my $badlog;
            if ($faillogfile ne "" && open($badlog, '>>', $faillogfile)) {
                print {$badlog} "Bad postscript header: not '%!'\n";
                close $badlog;
            }
            # nothing was written: close and drop the empty output file
            close $out;
            &util::rm($text_file);
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
#       $text =~ s/ ?? /\344/g; # a"

        print {$out} "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$output_filestem.text.tmp";
    &util::mv($text_file, $tmp_file);

    # These opens used `open H, "file" || die ...`, where `||` binds to the
    # file name and the check never fires.  Failures are now detected and
    # reported through the normal "return 0" failure path.
    my $infile;
    if (!open($infile, '<', $tmp_file)) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    my $outfile;
    if (!open($outfile, '>', $text_file)) {
        print STDERR "Couldn't open file for writing: $!\n";
        close $infile;
        &util::rm($tmp_file);
        return 0;
    }

    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # peel off at least $wrap_length characters plus the rest of the
            # current word; anything shorter (or unsplittable) goes out as is
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1469
1470
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text() into
# "$output_filestem.text", then wrapped in a minimal HTML page: blank lines
# become "<p>", every other line is prefixed with "<br> ", and the characters
# "&", "<" and ">" are escaped.  The intermediate text file is removed.
#
# Returns 1 on success; 0 if any_to_text() fails or if either file cannot be
# opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file; report failure to the caller
    # instead of claiming success without having written anything
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # '&' must be escaped first so the entities produced for '<' and '>'
        # are not themselves re-escaped
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1507
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Returns 0 immediately unless $use_strings is set, and 0 if either file
# cannot be opened or no printable text survives the filtering (the empty
# output file is then removed).  Returns 1 otherwise.
#
# NOTE(review): $use_strings is not declared in this sub - presumably a
# file-level option; confirm against the top of file.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless ($use_strings);

    print STDERR "\n**** In any to text****\n\n";
    unless (open(IN, "<$input_filename")) {
        return 0;
    }
    binmode(IN);
    unless (open(OUT, ">$output_filestem.text")) {
        return 0;
    }

    my $output_line_count = 0;
    while (<IN>) {
        my $chunk = $_;

        # delete anything that isn't a printable character
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long; repeat until no
        # short fragment is left, squeezing newlines on each pass
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # output whatever is left
        next unless ($chunk =~ m/[^\n ]/);
        print OUT $chunk;
        ++$output_line_count;
    }

    close OUT;
    close IN;

    # try to protect against binary only formats
    return 1 if ($output_line_count);

    &util::rm("$output_filestem.text");
    return 0;

}
Note: See TracBrowser for help on using the repository browser.