source: trunk/gsdl/bin/script/gsConvert.pl@ 1970

Last change on this file since 1970 was 1970, checked in by sjboddie, 23 years ago

Added more usage information to all perl programs and removed a few
programs that are no longer useful.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.4 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. These are usually found in the
31# $GSDLHOME/packages directory.
32#
33# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34# conversion utilities. We can convert any file to text with a perl
35# implementation of the UNIX strings command.
36
37
38BEGIN {
39 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
41}
42
43use parsargv;
44use util;
45use Cwd;
46use File::Basename;
47
48
# Write a short usage summary to STDERR and terminate the program with
# exit status 1.  This function never returns to its caller.
sub print_usage
{
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\n",
    );

    print STDERR @usage_text;
    exit(1);
}
59
60
# main(@args)
#
# Program entry point.  Parses the command line, works out the input
# type (from -type or, failing that, from the filename extension),
# changes into the document's directory and dispatches to the matching
# convertXXX function.  The value returned by that function ("html",
# "text" or "fail") is printed on STDOUT followed by a newline.
#
# Exits with status 1 (after a message on STDERR) when the arguments are
# invalid, the input file is unreadable, or the input type is unknown.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # $timeout is deliberately NOT declared with "my".  The *_to_* helper
    # functions later in this file read the package variable $timeout;
    # a lexical declared here would be invisible to them and the -timeout
    # option would silently have no effect.
    $timeout = 0;

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one argument (the file to convert) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the document's directory below, so a relative path
    # would stop resolving afterwards.  Make it absolute first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No explicit type given: fall back to the extension without its dot.
    # An empty suffix leaves the type undefined, which is reported below.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? substr($suffix, 1) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the format that was produced to whoever invoked us
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
134
135
136
137# Document-type conversion functions
138#
139# The following functions attempt to convert documents from their
140# input type to the specified output type. If no output type was
141# given, then they first attempt HTML, and then TEXT.
142#
143# Each returns the output type ("html" or "text") or "fail" if no
144# conversion is possible.
145
# Convert a Microsoft Word document
#
# Many .doc files are not in fact Word documents, so the file contents
# are inspected first and the work is handed to the matching converter.
#
# Arguments: input filename, output filestem, requested output type
#            (may be undefined or empty, meaning "no preference").
# Returns:   whatever the chosen converter returns ("html", "text" or
#            "fail").

sub convertDOC {
    # Lexical copies: these used to be package globals shared (and
    # overwritten) by every conversion function in the file.
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = find_docfile_type($input_filename);

    if ($realtype =~ /\Aword[678]\z/) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic strings-style treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
162
# Convert a Microsoft Word 6/7/8 document
#
# Tries the specialised Word-to-HTML converter when HTML output is
# acceptable, and falls back to the generic converter otherwise.
#
# Arguments: input filename, output filestem, requested output type
#            (may be undefined or empty, meaning "no preference").
# Returns:   "html", or whatever convertAnything returns.

sub convertWord678 {
    # Lexical copies rather than package globals, so that nested calls
    # cannot overwrite each other's arguments.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if doc_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
180
181
# Convert a Rich Text Format (RTF) file
#
# Tries the specialised RTF-to-HTML converter when HTML output is
# acceptable, and falls back to the generic converter otherwise.
#
# Arguments: input filename, output filestem, requested output type
#            (may be undefined or empty, meaning "no preference").
# Returns:   "html", or whatever convertAnything returns.

sub convertRTF {
    # Lexical copies rather than package globals, so that nested calls
    # cannot overwrite each other's arguments.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if rtf_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
199
200
# Convert an unidentified file
#
# Uses the crude strings-style extraction.  HTML is attempted first when
# it is acceptable, then plain text.
#
# Arguments: input filename, output filestem, requested output type
#            (may be undefined or empty, meaning "no preference").
# Returns:   "html", "text", or "fail" if neither could be produced.

sub convertAnything {
    # Lexical copies rather than package globals, so that nested calls
    # cannot overwrite each other's arguments.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
225
226
227
# Convert an Adobe PDF document
#
# HTML is attempted first when it is acceptable, then plain text.
#
# Arguments: directory of the document, input filename, output filestem,
#            requested output type (may be undefined or empty, meaning
#            "no preference").
# Returns:   "html", "text", or "fail" if neither could be produced.

sub convertPDF {
    # Lexical copies rather than package globals, so that nested calls
    # cannot overwrite each other's arguments.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT.  pdf_to_text unpacks three arguments
    # (dirname first); passing only two shifted every argument by one.
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";

}
254
255
# Convert an Adobe PostScript document
#
# Only conversion to plain text is available, so a request for HTML
# output alone always fails.
#
# Arguments: input filename, output filestem, requested output type
#            (may be undefined or empty, meaning "no preference").
# Returns:   "text" on success, otherwise "fail".

sub convertPS {
    # Lexical copies rather than package globals, so that nested calls
    # cannot overwrite each other's arguments.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";

}
274
275
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are really
# .rtf files or Word 5 files. This function attempts to tell the
# difference by looking at the file contents.
#
# Argument: name of the file to inspect.
# Returns:  "rtf" if the first line starts with "{\rtf";
#           "word6", "word7" or "word8" if any line contains the marker
#           "Word.Document.<n>";
#           "unknown" otherwise, including when the file cannot be opened.

sub find_docfile_type {
    my ($input_filename) = @_;

    # An unreadable file simply cannot be identified
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        # check to see if this is an rtf file (signature is on line one)
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }
        $first = 0;

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
313
314
315
316# Specific type-to-type conversions
317#
318# Each of the following functions attempts to convert a document from
319# a specific format to another. If they succeed they return 1 and leave
320# the output document(s) in the appropriate place; if they fail they
321# return 0 and delete any working files.
322
323
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output filestem.
# Returns:   1 if "<filestem>.html" was produced and its first line
#            contains "DOCTYPE HTML"; 0 otherwise (the .html and .err
#            files are removed in that case).
#
# Reads the package variable $timeout; when it is non-zero the command
# is prefixed with "ulimit -t <timeout>".

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = "";
    my $wv_conf = "";

    # GSDLOS may be unset, so test for that before matching against it
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
        $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";

    } else {
        my $wv_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
        $wv_conf = util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
        $wvWare = util::filename_cat($wv_home, "bin", "wvWare");
    }
    return 0 unless (-e "$wvWare");

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command; system returns -1 when the command could not
    # be started at all, so compare against 0 rather than testing "> 0"
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter: $!. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err");
            return 1;
        }

        # An error of some sort occurred
        util::rm("$output_filestem.html");
        util::rm("$output_filestem.err");
    }

    return 0;
}
372
373
# Attempt to convert an RTF document to html with rtf2html
#
# rtf2html isn't distributed with Greenstone because it is not
# distributed under the GPL. If you know of a better solution,
# please let me know.
#
# Arguments: input filename, output filestem.
# Returns:   1 if "<filestem>.html" was produced and its first line
#            contains "DOCTYPE HTML"; 0 otherwise (the .html and .err
#            files are removed in that case).
#
# Reads the package variable $timeout; when it is non-zero the command
# is prefixed with "ulimit -t <timeout>".

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $r_cmd = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                   "rtf2html", "rtf2html", "rtf2html");
    $r_cmd = "rtf2html" unless (-e "$r_cmd");
    # NOTE(review): for the bare "rtf2html" fallback this -e test looks in
    # the current directory, not along PATH, so an rtf2html installed
    # elsewhere on PATH is never used.  Behaviour kept as it was; confirm
    # whether a PATH search is wanted.
    return 0 unless (-e "$r_cmd");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command; system returns -1 when the command could not
    # be started at all, so compare against 0 rather than testing "> 0"
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter: $!. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err");
            return 1;
        }

        # An error of some sort occurred
        util::rm("$output_filestem.html");
        util::rm("$output_filestem.err");
    }
    return 0;
}
415
416
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the document (not used here), input filename,
#            output filestem.
# Returns:   1 if the command succeeded and "<filestem>.html" exists,
#            0 otherwise.  Any "<filestem>.err" content is echoed to
#            STDERR when no HTML file was produced.
#
# Reads the package variable $timeout; when it is non-zero the command
# is prefixed with "ulimit -t <timeout>".

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "pdftohtml.pl -F ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # system returns -1 when the command could not be started at all,
    # so compare against 0 rather than testing "> 0"
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        return 0;
    }

    # The .out working file is not wanted whether or not we succeeded
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # make sure the converter made something
    if (! -e "$output_filestem.html")
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
        }
        return 0;
    }

    return 1;
}
452
# Convert a PDF file to text with the pdftotext command
#
# Arguments: (dirname, input filename, output filestem), or just
#            (input filename, output filestem).  The two-argument form
#            is accepted because callers in this file have used it.
# Returns:   1 if the command succeeded and "<filestem>.text" exists,
#            0 otherwise (working files are removed in that case).

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem);
    if (scalar(@_) >= 3) {
        ($dirname, $input_filename, $output_filestem) = @_;
    } else {
        # Unpacking two arguments into three slots shifted everything by
        # one and left the filestem undefined.
        ($input_filename, $output_filestem) = @_;
    }

    # pdftotext takes the output file as its second argument; without it
    # the text goes to a file named after the PDF, not to stdout, so
    # redirecting stdout only ever produced an empty file.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system returns -1 when the command could not be started at all,
    # so compare against 0 rather than testing "> 0"
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # make sure the converter made something (the .text file: this
    # function never creates a .html file)
    if (! -e "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
488
# Convert a PostScript document to text with ps2ascii
#
# Arguments: input filename, output filestem.
# Returns:   1 if the command succeeded (output is in
#            "<filestem>.text"), 0 otherwise (working files are removed
#            in that case).

sub ps_to_text {
    # Lexical copies rather than package globals shared with other subs
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    # Quote the error-log path as well so names with spaces survive
    $cmd .= " 2> \"$output_filestem.err\"";

    # system returns -1 when the command could not be started at all,
    # so compare against 0 rather than testing "> 0"
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
508
509
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text, then each extracted line
# is written as a paragraph inside a minimal HTML page.
#
# Arguments: input filename, output filestem.
# Returns:   1 on success (output is in "<filestem>.html"), 0 if the
#            text extraction failed or a file could not be opened.
#            The intermediate "<filestem>.text" file is removed.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    my $opened = open($text, '<', "$output_filestem.text");
    if ($opened && !open($html, '>', "$output_filestem.html")) {
        close($text);
        $opened = 0;
    }
    if (!$opened) {
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html} '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
    print {$html} "\n\n";

    while (my $line = <$text>) {
        # The text was scraped from an arbitrary file, so markup
        # characters must be escaped ("&" first) to keep the page valid.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print {$html} "<p> ", $line;
    }
    print {$html} "\n</body></html>\n";

    # Close explicitly so the page is completely on disk before we return
    close($text);
    close($html);

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
538
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Every run of non-printable bytes is treated as a line break and any
# resulting fragment shorter than 10 characters is discarded; what is
# left is written to "<filestem>.text".
#
# Arguments: input filename, output filestem.
# Returns:   1 once the output file has been written and closed,
#            0 if the input or output file could not be opened or the
#            output could not be flushed.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print {$out} $line;
        }
    }

    close($in);

    # The output must be closed (and so flushed) here: callers read the
    # .text file straight after this function returns.
    close($out) or return 0;
    return 1;
}
Note: See TracBrowser for help on using the repository browser.