source: trunk/gsdl/bin/script/gsConvert.pl@ 1734

Last change on this file since 1734 was 1734, checked in by jrm21, 24 years ago

For postscript, fall back to some simple text extraction if ps2ascii isn't
found. (This should be portable as it is perl). It won't be formatted though,
so currently is only useful for indexing - users will have to view the
postscript for now...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.7 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. These are usually found in the
31# $GSDLHOME/packages directory.
32#
33# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34# conversion utilities. We can convert any file to text with a perl
35# implementation of the UNIX strings command.
36
37
# Make the Greenstone perl libraries loadable before any of the "use"
# statements below are compiled.  Refuses to run without GSDLHOME.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
42
43use parsargv;
44use util;
45use Cwd;
46use File::Basename;
47
48
# Print a summary of the command-line options to STDERR and exit with
# status 1.  This function never returns.
sub print_usage
{
    print STDERR "Usage: $0 [options] filename\n";
    print STDERR "Options are:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n";
    print STDERR "\t-timeout\t<max cpu seconds>\n";
    # -verbose is accepted by the argument parser in main() and so is
    # listed here as well.
    print STDERR "\t-verbose\t<level>\n";
    exit(1);
}
56
57
# Entry point: parse the command line, work out the input type and hand
# the file to the matching convertXXX function.
#
# Arguments: the command-line argument list (normally @ARGV).
# Output:    prints the value returned by the converter ("html", "text"
#            or "fail") followed by a newline on STDOUT.
# Exits with status 1 on a usage error, an unreadable input file or an
# input type that cannot be converted.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # The *_to_html helpers read the package variable $timeout, so the
    # option has to be stored there; a lexical declared in this function
    # would be invisible to them and the -timeout option would be ignored.
    our $timeout = 0;

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back on the filename extension (without its
    # leading dot).  A file with no extension leaves the type undefined.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? substr($suffix, 1) : undef;
    }

    # Change to temporary working directory
    # NOTE(review): $input_filename and $output_filestem are not re-based
    # after this chdir, so a relative path with a directory component
    # looks like it would no longer resolve -- confirm how callers invoke
    # this script.
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to whoever invoked the script
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

main(@ARGV);
131
132
133
134# Document-type conversion functions
135#
136# The following functions attempt to convert documents from their
137# input type to the specified output type. If no output type was
138# given, then they first attempt HTML, and then TEXT.
139#
140# Each returns the output type ("html" or "text") or "fail" if no
141# conversion is possible.
142
# Convert a Microsoft Word document.
#
# Arguments: input filename, output filestem (the converters append
# ".html" / ".text" to it) and the requested output type ("html", "text",
# or empty for "whatever works").
# Returns "html", "text" or "fail".

sub convertDOC {
    # Lexicals, so the arguments do not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic strings-style extraction.
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
162
# Convert a Microsoft word 6/7/8 document
#
# Arguments: input filename, output filestem, requested output type.
# Tries the specialised Word-to-HTML converter first (unless text output
# was asked for) and otherwise falls back on convertAnything.
# Returns "html", "text" or "fail".

sub convertWord678 {
    # Lexicals, so the arguments do not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&doc_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
181
182
# Convert a Rich Text Format (RTF) file
#
# Arguments: input filename, output filestem, requested output type.
# Tries the specialised RTF-to-HTML converter first (unless text output
# was asked for) and otherwise falls back on convertAnything.
# Returns "html", "text" or "fail".

sub convertRTF {
    # Lexicals, so the arguments do not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&rtf_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
200
201
# Convert an unidentified file
#
# Arguments: input filename, output filestem, requested output type.
# With no output type given, HTML is attempted first and then TEXT; with
# an explicit type only that type is attempted.
# Returns "html", "text" or "fail".

sub convertAnything {
    # Lexicals, so the arguments do not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&any_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        if (&any_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
226
227
228
# Convert an Adobe PDF document
#
# Arguments: directory of the input file, input filename, output
# filestem, requested output type.
# With no output type given, HTML is attempted first and then TEXT.
# Returns "html", "text" or "fail".

sub convertPDF {
    # Lexicals, so the arguments do not leak into package globals.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        # pdf_to_text unpacks ($dirname, $input_filename, $output_filestem),
        # so all three must be passed for the names to line up.
        if (&pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";

}
255
256
# Convert an Adobe PostScript document
#
# Arguments: input filename, output filestem, requested output type.
# Only TEXT output is available for PostScript; a request for HTML only
# therefore yields "fail".
# Returns "text" or "fail".

sub convertPS {
    # Lexicals, so the arguments do not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (&ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";

}
275
276
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the name of the file to inspect.
# Returns "rtf" if the first line starts with "{\rtf", "word6" / "word7" /
# "word8" if a "Word.Document.N" marker is found on any line, and
# "unknown" otherwise (including when the file cannot be opened).

sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        warn "Unable to open $input_filename: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (defined(my $line = <$chk>)) {
        # the RTF signature is only meaningful at the very start of the file
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }

        $first = 0;
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
316
317
318
319# Specific type-to-type conversions
320#
321# Each of the following functions attempts to convert a document from
322# a specific format to another. If they succeed they return 1 and leave
323# the output document(s) in the appropriate place; if they fail they
324# return 0 and delete any working files.
325
326
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output filestem.
# Writes "<filestem>.html" on success.  The converter's stderr goes to
# "<filestem>.err", which is removed again before returning.
# Returns 1 if the converter produced a file whose first line contains
# "DOCTYPE HTML", 0 otherwise (including when wvWare is not installed).

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = "";
    my $wv_conf = "";
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
        $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";
    } else {
        my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
        $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
        $wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
    }
    return 0 unless (-e "$wvWare");

    # formulate the command
    my $cmd = "";
    # $main::timeout is the package variable holding the -timeout option
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command -- exactly once
    my $status = system($cmd);
    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? "$!" : "exit status " . ($status >> 8);
        print STDERR "Error executing wv converter: $why. Continuing...\n";
    }

    # Was the conversion successful?
    my $ok = 0;
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $line = <$html>;
            close($html);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            $ok = 1;
        } else {
            # An error of some sort occurred
            &util::rm("$output_filestem.html");
        }
    }

    # the error log is a working file in either case
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return $ok;
}
379
380
# Attempt to convert an RTF document to html with rtftohtml
#
# rtf2html isn't distributed with Greenstone because it is not
# distributed under the GPL. If you know of a better solution,
# please let me know.
#
# Arguments: input filename, output filestem.
# The converter is looked for under $GSDLHOME/packages/unix/rtf2html
# first and then in the directories listed in PATH.
# Returns 1 if the converter produced a file whose first line contains
# "DOCTYPE HTML", 0 otherwise (including when no converter is found).

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # locate the converter
    my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                    "rtf2html", "rtf2html", "rtf2html");
    if (!-e "$r_cmd") {
        # Fall back on a copy installed somewhere on the PATH.  A plain
        # -e "rtf2html" test would only ever look in the current directory.
        require File::Spec;
        $r_cmd = "";
        SEARCH: foreach my $dir (File::Spec->path()) {
            foreach my $exe ("rtf2html", "rtf2html.exe") {
                my $candidate = File::Spec->catfile($dir, $exe);
                if (-f $candidate && -x $candidate) {
                    $r_cmd = $candidate;
                    last SEARCH;
                }
            }
        }
        return 0 if ($r_cmd eq "");
    }

    # formulate the command
    my $cmd = "";
    # $main::timeout is the package variable holding the -timeout option
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command
    my $status = system($cmd);
    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? "$!" : "exit status " . ($status >> 8);
        print STDERR "Error executing rtf converter: $why. Continuing...\n";
    }

    # Was the conversion successful?
    my $ok = 0;
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $line = <$html>;
            close($html);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            $ok = 1;
        } else {
            # An error of some sort occurred
            &util::rm("$output_filestem.html");
        }
    }

    # the error log is a working file in either case
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return $ok;
}
422
423
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the input file (not used here), input filename,
# output filestem.
# Returns 1 if pdftohtml exited cleanly and "<filestem>.html" exists,
# 0 otherwise (including when pdftohtml is not installed).  On failure
# the converter's stderr output is echoed to STDERR.  The ".out" and
# ".err" working files are removed before returning.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # formulate the command
    my $p_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "pdftohtml");
    my $pdftohtml = &util::filename_cat($p_home, "pdftohtml_0_20", "pdftohtml.bin");
    return 0 unless (-e "$pdftohtml");

    my $cmd = "";
    # $main::timeout is the package variable holding the -timeout option
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$pdftohtml -noframes";
    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " >\"$output_filestem.out\" 2>\"$output_filestem.err\"";

    my $ok = 1;

    # a return of -1 (command could not be started) is a failure too
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        $ok = 0;
    }
    # make sure the converter made something
    elsif (! -e "$output_filestem.html") {
        $ok = 0;
    }

    # print out the converters std err, if any; an unreadable log is not
    # worth aborting the whole run for
    if (!$ok && -e "$output_filestem.err") {
        if (open(my $errlog, "<", "$output_filestem.err")) {
            print STDERR "pdftohtml:\n";
            while (defined(my $msg = <$errlog>)) {
                print STDERR $msg;
            }
            close($errlog);
        }
    }

    # tidy up the working files
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return $ok;
}
467
468
# Convert a PDF file to text with the pdftotext command
#
# Arguments: ($dirname, $input_filename, $output_filestem).  The two
# argument form ($input_filename, $output_filestem) is accepted as well,
# because that is how this function has been called from convertPDF;
# $dirname is not used either way.
# Returns 1 if pdftotext exited cleanly and "<filestem>.text" exists,
# 0 otherwise.  On failure the converter's stderr output is echoed to
# STDERR and the partial output is removed.

sub pdf_to_text {
    my @args = @_;
    shift(@args) if (scalar(@args) >= 3);    # drop the unused $dirname
    my ($input_filename, $output_filestem) = @args;

    # pdftotext takes the output file as its second argument; without one
    # it writes next to the input file rather than to stdout, so the
    # result cannot be captured with a shell redirect.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2>\"$output_filestem.err\"";

    my $ok = 1;

    # a return of -1 (command could not be started) is a failure too
    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        $ok = 0;
    }
    # make sure the converter made something -- the text file, not html
    elsif (! -e "$output_filestem.text") {
        $ok = 0;
    }

    # print out the converters std err, if any
    if (!$ok && -e "$output_filestem.err") {
        if (open(my $errlog, "<", "$output_filestem.err")) {
            print STDERR "pdftotext:\n";
            while (defined(my $msg = <$errlog>)) {
                print STDERR $msg;
            }
            close($errlog);
        }
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return $ok;
}
504
505
# Convert a PostScript document to text with ps2ascii
#
# Arguments: input filename, output filestem.
# Writes "<filestem>.text".  If ps2ascii cannot be run or reports an
# error, falls back on a crude perl extraction that keeps the contents of
# the parenthesised strings in the file; that output is unformatted.
# Returns 1 once either method has written the text file, 0 if the
# fallback cannot open its input or output file.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    $cmd .= " 2>\"$output_filestem.err\"";

    # a return of -1 (command could not be started) is a failure too
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on code nicked from:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Attempting to strip text from postscript.\n";
        my ($in, $out);
        if (!open($in, "<", $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        if (!open($out, ">", "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            close($in);
            print STDERR "errors\n";
            return 0;
        }

        # true while a string that was continued with a trailing
        # backslash is still open from the previous line
        my $in_a_sentence=0;
        while (<$in>) {
            if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
            # attempt to add whitespace between different lines...
            s/F.?\(/\( /g; # this might break up some other words though...
            ### remove all postscript control data
            if (!$in_a_sentence) {
                s/^[^\(\)]*?\(//;} # rm start of line up to first open bracket
            s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
            s/\)([^\(\)])*?\(//g ; # close bracket up to next open unquoted bracket
            if (s/\)[^\(\)]*?$//g) # last close bracket to end of line
            {$in_a_sentence=0;chomp;}
            if (s/\\$//) # if line is a continuation
            {$in_a_sentence=1;chomp;}
            s/^$//g ; # remove empty lines
            ### ligatures have special characters...
            s/\\214/fi/g;
            s/\\215/fl/g;
            print $out "$_";
        }
        close($in);
        close($out);
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
556
557
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filestem.
# Runs any_to_text first and then wraps each line of "<filestem>.text" in
# a paragraph inside a minimal HTML page written to "<filestem>.html".
# The intermediate text file is removed afterwards.
# Returns 1 on success, 0 if the text extraction fails or either file
# cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, "<", "$output_filestem.text")) {
        warn "Couldn't read $output_filestem.text: $!\n";
        return 0;
    }
    if (!open($html, ">", "$output_filestem.html")) {
        warn "Couldn't write $output_filestem.html: $!\n";
        close($text);
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
    print $html "\n\n";

    while (defined(my $line = <$text>)) {
        # the extracted text is arbitrary data, so markup characters must
        # be escaped or they would be read as tags and entities
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print $html "<p> ", $line;
    }
    print $html "\n</body></html>\n";

    close($text);
    close($html);

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
588
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filestem.
# Reads the input in binary mode and writes to "<filestem>.text" every
# run of at least 10 printable ASCII characters (octal 040-176), one run
# per line.
# Returns 1 on success, 0 if either file cannot be opened or the output
# cannot be flushed.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my ($in, $out);
    if (!open($in, "<", $input_filename)) {
        warn "Couldn't read $input_filename: $!\n";
        return 0;
    }
    binmode($in);
    if (!open($out, ">", "$output_filestem.text")) {
        warn "Couldn't write $output_filestem.text: $!\n";
        close($in);
        return 0;
    }

    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    close($in);
    # The output must be closed (and so flushed) before returning:
    # any_to_html reads the text file back straight away.
    if (!close($out)) {
        warn "Couldn't finish writing $output_filestem.text: $!\n";
        return 0;
    }
    return 1;
}
627
628
629
Note: See TracBrowser for help on using the repository browser.