source: trunk/gsdl/bin/script/gsConvert.pl@ 2031

Last change on this file since 2031 was 2031, checked in by jrm21, 23 years ago

Improved postscript to text handling a little bit better.
Also, system($cmd) return value can be "-1", not just ">0"....

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. These are usually found in the
31# $GSDLHOME/packages directory.
32#
33# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34# conversion utilities. We can convert any file to text with a perl
35# implementation of the UNIX strings command.
36
37
# Greenstone's own perl modules (parsargv, util) are loaded from
# $GSDLHOME/perllib, so that directory is pushed onto the front of @INC
# at compile time. Without GSDLHOME there is nothing to load them from.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
42
43use parsargv;
44use util;
45use Cwd;
46use File::Basename;
47
48
# Write the command-line synopsis to STDERR and terminate the program
# with exit status 1. Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
59
60
# Entry point: parse the command line, work out the input type and hand
# the file to the matching convert* routine.
#
# Arguments: the command-line argument list (normally @ARGV).
# Output:    prints the resulting format ("html", "text" or "fail")
#            followed by a newline on STDOUT.
# Exits with status 1 (directly or through print_usage) on bad usage, an
# unreadable input file, or an input type no converter exists for.
sub main
{
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # The *_to_html helpers read the timeout as the package variable
    # $timeout (i.e. $main::timeout). Declaring it with "my" here would
    # hide the parsed value from them and -timeout would be ignored.
    $main::timeout = 0;

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$main::timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must be left once the options are consumed
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below; a relative name would no
    # longer resolve after that, so pin it down to an absolute path first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension (without the
    # leading dot, lower-cased so that FOO.DOC is treated like foo.doc).
    if (!defined $input_type || $input_type eq "") {
        $input_type = (defined $suffix && length($suffix) > 1)
            ? lc(substr($suffix, 1))
            : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname: $!";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    my $result;
    if ($input_type eq "doc") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the format that was produced ("html", "text" or "fail")
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir: $!";
}
132
# Run the conversion on the arguments given on the command line.
main(@ARGV);
134
135
136
137# Document-type conversion functions
138#
139# The following functions attempt to convert documents from their
140# input type to the specified output type. If no output type was
141# given, then they first attempt HTML, and then TEXT.
142#
143# Each returns the output type ("html" or "text") or "fail" if no
144# conversion is possible.
145
# Convert a Microsoft Word document.
#
# Many files with a .doc extension are not Word 6/7/8 documents at all,
# so the file is sniffed first and handed to the converter that matches
# what it really contains.
#
# Arguments: input filename, output filestem (output path without an
#            extension), requested output type ("html", "text", or
#            empty/undefined meaning "try html, then text").
# Returns:   "html", "text" or "fail".

sub convertDOC {
    # lexicals, so that other subs' variables of the same name are not
    # overwritten as a side effect of calling this one
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # anything else: fall back to the generic strings-style extraction
    return convertAnything($input_filename, $output_filestem, $output_type);
}
162
# Convert a Microsoft Word 6/7/8 document.
#
# Tries the specialised Word-to-HTML converter when HTML output is
# acceptable, and falls back to the generic conversion otherwise.
#
# Arguments: input filename, output filestem, requested output type
#            ("html", "text", or empty/undefined for "either").
# Returns:   "html", "text" or "fail".

sub convertWord678 {
    # lexicals rather than implicit package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (doc_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
180
181
# Convert a Rich Text Format (RTF) file.
#
# Tries the specialised RTF-to-HTML converter when HTML output is
# acceptable, and falls back to the generic conversion otherwise.
#
# Arguments: input filename, output filestem, requested output type
#            ("html", "text", or empty/undefined for "either").
# Returns:   "html", "text" or "fail".

sub convertRTF {
    # lexicals rather than implicit package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (rtf_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
199
200
# Convert an unidentified file.
#
# Uses the crude "strings"-style extraction: HTML is attempted first
# (when acceptable), then plain text.
#
# Arguments: input filename, output filestem, requested output type
#            ("html", "text", or empty/undefined for "either").
# Returns:   "html", "text" or "fail".

sub convertAnything {
    # lexicals rather than implicit package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (any_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        if (any_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
225
226
227
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output
#            filestem, requested output type ("html", "text", or
#            empty/undefined for "try html, then text").
# Returns:   "html", "text" or "fail".

sub convertPDF {
    # lexicals rather than implicit package globals
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        # pdf_to_text unpacks (dirname, input, stem) just like
        # pdf_to_html; leaving $dirname out shifts every argument by one.
        if (pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
254
255
# Convert an Adobe PostScript document.
#
# Only text output is supported; a request for html therefore fails.
#
# Arguments: input filename, output filestem, requested output type
#            ("text", or empty/undefined for "whatever is possible").
# Returns:   "text" or "fail".

sub convertPS {
    # lexicals rather than implicit package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
274
275
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: name of the file to inspect.
# Returns:  "rtf" if the first line starts with "{\rtf",
#           "word6", "word7" or "word8" if any line contains the
#           matching "Word.Document.N" marker,
#           "unknown" otherwise (including when the file cannot be read).

sub find_docfile_type {
    my ($input_filename) = @_;

    # three-argument open: the filename is never interpreted as a mode
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        warn "find_docfile_type: unable to read $input_filename: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (defined(my $line = <$chk>)) {

        # the rtf signature only counts at the very start of the file
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }
        $first = 0;

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
313
314
315
316# Specific type-to-type conversions
317#
318# Each of the following functions attempts to convert a document from
319# a specific format to another. If they succeed they return 1 and leave
320# the output document(s) in the appropriate place; if they fail they
321# return 0 and delete any working files.
322
323
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output filestem.
# Returns:   1 if "<stem>.html" was produced and its first line contains
#            "DOCTYPE HTML", 0 otherwise (the working files are removed).
# Reads the optional CPU limit from the package variable $main::timeout.
#
# NOTE(review): the filenames are interpolated into a shell command
# inside double quotes; names containing '"', '$' or backquotes are not
# safe here. The shell is needed for the redirections and ulimit.

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                    $ENV{'GSDLOS'}, "wvWare");
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $wvWare .= ".exe";
    }
    return 0 unless (-e "$wvWare");

    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "packages",
                                     "wv", "wvHtml.xml");

    my $cmd = "";
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command. system() returns -1 when the command could not
    # be started at all; only then does $! describe the problem.
    my $status = system($cmd);
    if ($status != 0) {
        my $reason = ($status == -1) ? "$!" : "exit status " . ($? >> 8);
        print STDERR "Error executing wv converter: $reason. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if (defined $line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        util::rm("$output_filestem.html");
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
365
366
# Attempt to convert an RTF document to html with rtftohtml
#
# rtf2html isn't distributed with Greenstone because it is not
# distributed under the GPL. If you know of a better solution,
# please let me know.
#
# The converter is looked for under $GSDLHOME/packages/unix/rtf2html
# first and then in the directories of the PATH.
#
# Arguments: input filename, output filestem.
# Returns:   1 if "<stem>.html" was produced and its first line contains
#            "DOCTYPE HTML", 0 otherwise (the working files are removed).
# Reads the optional CPU limit from the package variable $main::timeout.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $r_cmd = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                   "rtf2html", "rtf2html", "rtf2html");
    if (!-e "$r_cmd") {
        # Not bundled: use an rtf2html from the PATH. Testing -e on the
        # bare name would only ever look in the current directory.
        require File::Spec;
        $r_cmd = undef;
        foreach my $dir (File::Spec->path()) {
            my $candidate = File::Spec->catfile($dir, "rtf2html");
            if (-f $candidate && -x $candidate) {
                $r_cmd = $candidate;
                last;
            }
        }
        return 0 unless (defined $r_cmd);
    }

    my $cmd = "";
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command. system() returns -1 when the command could not
    # be started at all; only then does $! describe the problem.
    my $status = system($cmd);
    if ($status != 0) {
        my $reason = ($status == -1) ? "$!" : "exit status " . ($? >> 8);
        print STDERR "Error executing rtf converter: $reason. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if (defined $line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        util::rm("$output_filestem.html");
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
408
409
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the input file (accepted for interface
#            compatibility, not used), input filename, output filestem.
# Returns:   1 if "<stem>.html" exists after running pdftohtml.pl,
#            0 otherwise. On failure the contents of "<stem>.err", if
#            present, are copied to STDERR.
# Reads the optional CPU limit from the package variable $main::timeout.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "pdftohtml.pl -F ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # system() returns -1 when the command could not be started at all;
    # only then does $! describe the problem.
    my $status = system($cmd);
    if ($status != 0) {
        my $reason = ($status == -1) ? "$!" : "exit status " . ($? >> 8);
        print STDERR "Error executing $cmd: $reason\n";
        return 0;
    }

    # the .out working file is not needed whatever the outcome
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # make sure the converter made something
    if (! -e "$output_filestem.html") {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftohtml:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close($errlog);
        }
        return 0;
    }

    return 1;
}
445
# Convert a PDF file to text with the pdftotext command
#
# Arguments: (dirname, input filename, output filestem). The directory
#            is not used, and callers that pass only (input filename,
#            output filestem) are accepted as well: the last two
#            arguments are always taken as input and stem.
# Returns:   1 if "<stem>.text" was produced, 0 otherwise. On failure
#            the contents of "<stem>.err", if present, go to STDERR.

sub pdf_to_text {
    my ($input_filename, $output_filestem) = @_[-2, -1];

    # pdftotext writes to the file named by its second argument; with
    # only one argument it writes next to the input instead of stdout,
    # so redirecting stdout would always give an empty file.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() returns -1 when the command could not be started at all;
    # only then does $! describe the problem.
    my $status = system($cmd);
    if ($status != 0) {
        my $reason = ($status == -1) ? "$!" : "exit status " . ($? >> 8);
        print STDERR "Error executing $cmd: $reason\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # make sure the converter made something (the .text file, not .html)
    if (! -e "$output_filestem.text") {
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftotext:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR $err_line;
            }
            close($errlog);
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
481
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# If gs cannot be run, fails, or reports an interpreter error, the text
# is instead stripped out of the PostScript source with a series of
# regular expressions (see _ps_strip_to_text).
#
# Arguments: input filename, output filestem.
# Returns:   1 if "<stem>.text" was written (by gs or by the fallback),
#            0 if the fallback could not read the input or write the
#            output.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    $! = 0;
    system($cmd);

    # $? is non-zero for a failed start (-1), a non-zero exit status and
    # for death by signal; looking only at ($? >> 8) misses the latter.
    my $error = "";
    if ($? != 0) {
        $error = $! ? "$!" : "couldn't run.\n";
    }
    elsif (! -e "$output_filestem.text") {
        $error = "did not create output file.\n";
    }
    else {
        # make sure the interpreter didn't get an error. It is technically
        # possible for the actual text to start with this, but....
        if (open(my $psout, '<', "$output_filestem.text")) {
            my $first_line = <$psout>;
            close($psout);
            if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "PSPLUG: WARNING: Error executing gs: $error\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "PSPlug: Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # read the whole .ps file in one go
        my $text = do { local $/; <$in> };
        close($in);
        $text = "" unless defined $text;

        my $out;
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print $out _ps_strip_to_text($text);
        close($out);
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}

# Crude extraction of the readable text from PostScript source.
#
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
#
# Argument: the complete PostScript source as one string.
# Returns:  the extracted text. The substitutions are order dependent.

sub _ps_strip_to_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    # NOTE(review): the next two replacements emit octal 023 and 252,
    # which do not look like "e acute" / "u umlaut" in Latin-1 -- confirm
    # the intended code points before changing them.
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
#   $text =~ s/ ?? /\344/g; # a"

    return $text;
}
593
594
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted into "<stem>.text" by any_to_text, then
# wrapped line by line into "<stem>.html"; the intermediate text file is
# removed afterwards.
#
# Arguments: input filename, output filestem.
# Returns:   1 on success, 0 if the text extraction failed or one of the
#            working files could not be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', "$output_filestem.text")) {
        warn "any_to_html: unable to read $output_filestem.text: $!\n";
        return 0;
    }
    if (!open($html, '>', "$output_filestem.html")) {
        warn "any_to_html: unable to write $output_filestem.html: $!\n";
        close($text);
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
    print $html "\n\n";

    while (defined(my $line = <$text>)) {
        # the extracted strings are plain text: escape the characters
        # that would otherwise be read as markup
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print $html "<p> ", $line;
    }
    print $html "\n</body></html>\n";

    # close both files before removing the intermediate one: the output
    # must be flushed, and an open file cannot be deleted on Windows
    close($text);
    close($html);

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
623
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Every run of non-printable bytes becomes a line break and runs of
# printable characters shorter than 10 characters are dropped; what is
# left is written to "<stem>.text".
#
# Arguments: input filename, output filestem.
# Returns:   1 once the text file has been written and closed,
#            0 if the input cannot be read or the output cannot be
#            written.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my ($in, $out);
    if (!open($in, '<', $input_filename)) {
        warn "any_to_text: unable to read $input_filename: $!\n";
        return 0;
    }
    binmode($in);
    if (!open($out, '>', "$output_filestem.text")) {
        warn "any_to_text: unable to write $output_filestem.text: $!\n";
        close($in);
        return 0;
    }

    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }
    close($in);

    # The output has to be closed (flushed) before returning: callers
    # such as any_to_html read the file back straight away.
    if (!close($out)) {
        warn "any_to_text: error writing $output_filestem.text: $!\n";
        return 0;
    }
    return 1;
}
Note: See TracBrowser for help on using the repository browser.