source: trunk/gsdl/bin/script/gsConvert.pl@ 1928

Last change on this file since 1928 was 1928, checked in by sjboddie, 23 years ago

Added: pdftohtml.pl - Perl script that handles conversion of PDF documents into

HTML. Called by gsConvert.pl in sub pdf_to_html.

Modified: gsConvert.pl - Perl script that converts various formats (MSWord,

RTF, PDF, PS) into HTML when importing the collection.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. These are usually found in the
31# $GSDLHOME/packages directory.
32#
33# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34# conversion utilities. We can convert any file to text with a perl
35# implementation of the UNIX strings command.
36
37
BEGIN {
    # Refuse to start without GSDLHOME, then put the Greenstone perllib
    # directory at the front of the module search path so that the
    # project modules loaded below can be found.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
42
43use parsargv;
44use util;
45use Cwd;
46use File::Basename;
47
48
sub print_usage
{
    # Write the command-line synopsis to STDERR and terminate the
    # program with exit status 1; this sub never returns to its caller.
    my @usage_lines = (
        "Usage: $0 [options] filename\n",
        "Options are:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\n",
    );

    foreach my $text (@usage_lines) {
        print STDERR $text;
    }

    exit(1);
}
56
57
sub main
{
    # Entry point: parse the command line, work out the input type, run
    # the matching converter and print its result ("html", "text" or
    # "fail") followed by a newline on STDOUT.
    #
    # Arguments: the command-line argument list (normally @ARGV).
    # Exits with status 1 on a usage error, an unreadable input file or
    # an input type no converter handles.
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # The conversion subs read $timeout as a package variable, so the
    # parsed value is stored in $main::timeout; a lexical declared here
    # would be invisible to them and the CPU limit would never apply.
    $main::timeout = 0;

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$main::timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the input file's directory below, so a relative path
    # has to be made absolute first or it would no longer resolve.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No -type given: fall back on the filename extension (minus the dot)
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix =~ /^\.(.+)$/s) ? $1 : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem,
                             $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to the caller on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

main(@ARGV);
131
132
133
134# Document-type conversion functions
135#
136# The following functions attempt to convert documents from their
137# input type to the specified output type. If no output type was
138# given, then they first attempt HTML, and then TEXT.
139#
140# Each returns the output type ("html" or "text") or "fail" if no
141# conversion is possible.
142
143# Convert a Microsoft word document
144
sub convertDOC {
    # Convert a file that carries a .doc extension.
    #
    # Arguments: input filename, output file stem (output path without
    # extension), requested output type (may be empty/undef).
    # Returns whatever the chosen converter returns: "html", "text" or
    # "fail".
    #
    # The arguments are lexicals so this sub does not overwrite package
    # variables of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic strings-style treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
159
160# Convert a Microsoft word 6/7/8 document
161
sub convertWord678 {
    # Convert a Microsoft Word 6/7/8 document.
    #
    # Arguments: input filename, output file stem, requested output type
    # (may be empty/undef).
    # Returns "html" when doc_to_html succeeds, otherwise the result of
    # convertAnything ("html", "text" or "fail").
    #
    # The arguments are lexicals so this sub does not overwrite package
    # variables of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        print STDERR "I am about to call doc_to_html...\n";
        if (doc_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
178
179
180# Convert a Rich Text Format (RTF) file
181
sub convertRTF {
    # Convert a Rich Text Format (RTF) file.
    #
    # Arguments: input filename, output file stem, requested output type
    # (may be empty/undef).
    # Returns "html" when rtf_to_html succeeds, otherwise the result of
    # convertAnything ("html", "text" or "fail").
    #
    # The arguments are lexicals so this sub does not overwrite package
    # variables of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (rtf_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
197
198
199# Convert an unidentified file
200
sub convertAnything {
    # Convert an unidentified file using the generic converters.
    #
    # Arguments: input filename, output file stem, requested output type
    # (may be empty/undef, in which case HTML is tried before TEXT).
    # Returns "html", "text", or "fail" if neither attempt succeeds.
    #
    # The arguments are lexicals so this sub does not overwrite package
    # variables of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (any_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        if (any_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
223
224
225
226# Convert an Adobe PDF document
227
sub convertPDF {
    # Convert an Adobe PDF document.
    #
    # Arguments: directory of the input file, input filename, output
    # file stem, requested output type (may be empty/undef, in which
    # case HTML is tried before TEXT).
    # Returns "html", "text", or "fail" if neither attempt succeeds.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        # pdf_to_text unpacks (dirname, input, stem); passing only two
        # arguments would shift every value by one position.
        if (pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
252
253
254# Convert an Adobe PostScript document
255
sub convertPS {
    # Convert an Adobe PostScript document; only TEXT output is
    # attempted.
    #
    # Arguments: input filename, output file stem, requested output type
    # (may be empty/undef).
    # Returns "text" when ps_to_text succeeds, otherwise "fail" (which
    # includes the case where only HTML output was requested).
    #
    # The arguments are lexicals so this sub does not overwrite package
    # variables of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
272
273
274# Find the real type of a .doc file
275#
276# We seem to have a lot of files with a .doc extension that are .rtf
277# files or Word 5 files. This function attempts to tell the difference.
278
sub find_docfile_type {
    # Sniff the real format of a file that claims to be a .doc.
    #
    # Argument: the input filename.
    # Returns "rtf" when the first line starts with "{\rtf", "word6",
    # "word7" or "word8" when any line contains "Word.Document.<n>",
    # and "unknown" otherwise (including when the file cannot be
    # opened).
    #
    # No scratch file is written and the input handle is closed on every
    # return path.
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (defined(my $line = <$chk>)) {
        # an RTF signature only counts at the very start of the file
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }

        $first = 0;
    }

    close($chk);
    return $type;
}
313
314
315
316# Specific type-to-type conversions
317#
318# Each of the following functions attempts to convert a document from
319# a specific format to another. If they succeed they return 1 and leave
320# the output document(s) in the appropriate place; if they fail they
321# return 0 and delete any working files.
322
323
324# Attempt to convert a word document to html with the wv program
325
sub doc_to_html {
    # Attempt to convert a word document to html with the wv program.
    #
    # Arguments: input filename, output file stem.
    # Returns 1 and leaves "<stem>.html" in place when the first line of
    # the generated file contains "DOCTYPE HTML"; otherwise removes the
    # working files and returns 0. Also returns 0 when the wvWare
    # executable is not found.
    #
    # $timeout is read as a package variable, not passed in.
    my ($input_filename, $output_filestem) = @_;

    my $wvWare  = "";
    my $wv_conf = "";

    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $wvWare  = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
        $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";
    } else {
        my $wv_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
        $wv_conf = util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
        $wvWare  = util::filename_cat($wv_home, "bin", "wvWare");
    }
    return 0 unless (-e "$wvWare");

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command; system() returns -1 when the command could
    # not be started (only then is $! meaningful) and the wait status
    # otherwise
    my $status = system($cmd);
    if ($status != 0) {
        my $why = ($status == -1) ? $! : "exit status " . ($? >> 8);
        print STDERR "Error executing wv converter: $why. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $line = <$html>;
            close($html);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        util::rm("$output_filestem.html");
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
372
373
374# Attempt to convert an RTF document to html with rtftohtml
375#
376# rtf2html isn't distributed with Greenstone because it is not
377# distributed under the GPL. If you know of a better solution,
378# please let me know.
379
sub rtf_to_html {
    # Attempt to convert an RTF document to html with rtf2html.
    #
    # Arguments: input filename, output file stem.
    # Returns 1 and leaves "<stem>.html" in place when the first line of
    # the generated file contains "DOCTYPE HTML"; otherwise removes the
    # working files and returns 0. Also returns 0 when no rtf2html
    # executable can be found.
    #
    # $timeout is read as a package variable, not passed in.
    my ($input_filename, $output_filestem) = @_;

    # Prefer the copy under $GSDLHOME/packages
    my $r_cmd = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                   "rtf2html", "rtf2html", "rtf2html");

    # Otherwise look for rtf2html on the PATH. Testing -e on the bare
    # name would only ever look in the current directory.
    if (!-e $r_cmd) {
        my $on_path;
        my @path_dirs = defined $ENV{'PATH'} ? split(/:/, $ENV{'PATH'}) : ();
        foreach my $dir (@path_dirs) {
            my $candidate = ($dir eq "" ? "." : $dir) . "/rtf2html";
            if (-f $candidate && -x $candidate) {
                $on_path = $candidate;
                last;
            }
        }
        return 0 unless defined $on_path;
        $r_cmd = $on_path;
    }

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command; system() returns -1 when the command could
    # not be started (only then is $! meaningful) and the wait status
    # otherwise
    my $status = system($cmd);
    if ($status != 0) {
        my $why = ($status == -1) ? $! : "exit status " . ($? >> 8);
        print STDERR "Error executing rtf converter: $why. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $line = <$html>;
            close($html);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        util::rm("$output_filestem.html");
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
415
416
417# Convert a pdf file to html with the pdftohtml command
418
sub pdf_to_html {
    # Convert a pdf file to html with the pdftohtml.pl command.
    #
    # Arguments: directory of the input file (not used here), input
    # filename, output file stem.
    # Returns 1 when the command succeeds and "<stem>.html" exists,
    # 0 otherwise.
    #
    # $timeout is read as a package variable, not passed in.
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "pdftohtml.pl -F ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # system() returns -1 when the command could not be started (only
    # then is $! meaningful) and the wait status otherwise
    my $status = system($cmd);
    if ($status != 0) {
        my $why = ($status == -1) ? $! : "exit status " . ($? >> 8);
        print STDERR "Error executing $cmd: $why\n";
        return 0;
    }

    # the .out working file is never wanted, whatever the outcome
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # make sure the converter made something
    return 1 if (-e "$output_filestem.html");

    # print out the converters std err, if any (best effort: an
    # unreadable log is skipped instead of aborting the whole run)
    if (-e "$output_filestem.err") {
        if (open(my $errlog, '<', "$output_filestem.err")) {
            print STDERR "pdftohtml:\n";
            while (defined(my $line = <$errlog>)) {
                print STDERR $line;
            }
            close($errlog);
        }
    }
    return 0;
}
452
453# Convert a PDF file to text with the pdftotext command
454
sub pdf_to_text {
    # Convert a PDF file to text with the pdftotext command.
    #
    # Arguments: either (dirname, input filename, output file stem) or
    # just (input filename, output file stem); both call shapes are
    # accepted. The directory is not used.
    # Returns 1 and leaves "<stem>.text" in place on success; otherwise
    # removes the working files and returns 0.
    my ($input_filename, $output_filestem) = (@_ > 2) ? @_[1, 2] : @_;

    # pdftotext takes the output file as its second argument; with only
    # the input named it writes next to the PDF instead of to stdout,
    # so redirecting stdout would produce an empty file.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() returns -1 when the command could not be started (only
    # then is $! meaningful) and the wait status otherwise
    my $status = system($cmd);
    if ($status != 0) {
        my $why = ($status == -1) ? $! : "exit status " . ($? >> 8);
        print STDERR "Error executing $cmd: $why\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # make sure the converter made something
    if (!-e "$output_filestem.text") {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                print STDERR "pdftotext:\n";
                while (defined(my $line = <$errlog>)) {
                    print STDERR $line;
                }
                close($errlog);
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
488
489# Convert a PostScript document to text with ps2ascii
490
sub ps_to_text {
    # Convert a PostScript document to text with ps2ascii, falling back
    # on a crude built-in extractor when ps2ascii fails.
    #
    # Arguments: input filename, output file stem.
    # Returns 1 with the text in "<stem>.text", or 0 when the fallback
    # cannot open its input or output file.
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() returns -1 when the command could not be started (only
    # then is $! meaningful) and the wait status otherwise
    my $status = system($cmd);
    if ($status != 0) {
        my $why = ($status == -1) ? $! : "exit status " . ($? >> 8);
        print STDERR "Error executing $cmd: $why\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on code nicked from:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Attempting to strip text from postscript.\n";

        my ($in, $out);
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            close($in);
            print STDERR "errors\n";
            return 0;
        }

        # true while a bracketed string continues onto the next line
        my $in_a_sentence = 0;
        while (defined(my $line = <$in>)) {
            # no brackets in line
            next if ($line =~ /^[^\(\)]+$/ && !$in_a_sentence);

            # attempt to add whitespace between different lines...
            # (this might break up some other words though...)
            $line =~ s/F.?\(/\( /g;

            ### remove all postscript control data
            if (!$in_a_sentence) {
                # rm start of line up to first open bracket
                $line =~ s/^[^\(\)]*?\(//;
            }
            # change quoted braces
            $line =~ s/\\\(/\{/g;
            $line =~ s/\\\)/\}/g;
            # close bracket up to next open unquoted bracket
            $line =~ s/\)([^\(\)])*?\(//g;
            # last close bracket to end of line
            if ($line =~ s/\)[^\(\)]*?$//g) {
                $in_a_sentence = 0;
                chomp($line);
            }
            # if line is a continuation
            if ($line =~ s/\\$//) {
                $in_a_sentence = 1;
                chomp($line);
            }

            ### ligatures have special characters...
            $line =~ s/\\214/fi/g;
            $line =~ s/\\215/fl/g;
            print $out $line;
        }
        close($in);
        close($out);
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
539
540
541# Convert any file to HTML with a crude perl implementation of the
542# UNIX strings command.
543
sub any_to_html {
    # Convert any file to HTML: extract text with any_to_text, then wrap
    # every extracted line in a <p> inside a minimal HTML page.
    #
    # Arguments: input filename, output file stem.
    # Returns 1 with the page in "<stem>.html", or 0 when the text
    # extraction fails or either file cannot be opened/written. The
    # intermediate "<stem>.text" file is removed.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', "$output_filestem.text")) {
        return 0;
    }
    if (!open($html, '>', "$output_filestem.html")) {
        close($text);
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
    print $html "\n\n";

    while (defined(my $line = <$text>)) {
        print $html "<p> ", $line;
    }
    print $html "\n</body></html>\n";

    close($text);
    # a failed close means buffered output never reached the file
    my $written = close($html);

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return $written ? 1 : 0;
}
569
570# Convert any file to TEXT with a crude perl implementation of the
571# UNIX strings command.
572
sub any_to_text {
    # Convert any file to TEXT with a crude perl implementation of the
    # UNIX strings command: keep runs of printable ASCII that are at
    # least 10 characters long, one per line.
    #
    # Arguments: input filename, output file stem.
    # Returns 1 with the text written to "<stem>.text", or 0 when the
    # input cannot be opened or the output cannot be created/written.
    #
    # Both handles are closed before returning so that the output is
    # fully on disk when the caller reads it back.
    my ($input_filename, $output_filestem) = @_;

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    while (defined(my $line = <$in>)) {
        # turn every run of non-printable characters into a line break
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    close($in);
    close($out) or return 0;
    return 1;
}
Note: See TracBrowser for help on using the repository browser.