source: trunk/gsdl/bin/script/gsConvert.pl@ 1654

Last change on this file since 1654 was 1654, checked in by paynter, 24 years ago

Check .doc files to see if they are RTF files, Word 6/7/8 files that wv
handles, or "unknown" files (which we strip of binary characters and hope
the result is worthwhile).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# gsConvert.pl -- convert documents to HTML or TEXT format
6#
7# A component of the Greenstone digital library software
8# from the New Zealand Digital Library Project at the
9# University of Waikato, New Zealand.
10#
11# Copyright (C) 1999 New Zealand Digital Library Project
12#
13# This program is free software; you can redistribute it and/or modify
14# it under the terms of the GNU General Public License as published by
15# the Free Software Foundation; either version 2 of the License, or
16# (at your option) any later version.
17#
18# This program is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21# GNU General Public License for more details.
22#
23# You should have received a copy of the GNU General Public License
24# along with this program; if not, write to the Free Software
25# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26#
27###########################################################################
28
29# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30# by exploiting third-party programs. These are usually found in the
31# $GSDLHOME/packages directory.
32#
33# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34# conversion utilities. We can convert any file to text with a perl
35# implementation of the UNIX strings command.
36
37
38BEGIN {
39 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
41}
42
43use parsargv;
44use util;
45use Cwd;
46use File::Basename;
47
48
# Write the command-line synopsis to STDERR and terminate the script
# with exit status 1.
sub print_usage
{
    printf STDERR ("Usage: %s [-type doc|pdf] [-output html|text] filename\n", $0);
    exit(1);
}
54
55
# Entry point.
#
# Parses the command line, works out the input document type (from -type
# or, failing that, from the filename extension), changes into the
# document's directory, runs the matching converter and prints the
# converter's result ("html", "text" or "fail") followed by a newline on
# STDOUT.  Usage and type errors are reported on STDERR with exit status 1.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the document's directory below, so a relative path
    # would no longer resolve afterwards; make it absolute first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = util::filename_cat($dirname, $tailname);

    # No explicit type: use the text after the last dot of the suffix, so
    # that "report.v2.doc" is still recognised as "doc".
    if (!defined $input_type || $input_type eq "") {
        ($input_type) = ($suffix =~ /\.([^.]+)$/);
    }
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "doc") {
        print convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
120
# Run the converter on the script's command-line arguments.
main(@ARGV);
122
123
124
125# Document-type conversion functions
126#
127# The following functions attempt to convert documents from their
128# input type to the specified output type. If no output type was
129# given, then they first attempt HTML, and then TEXT.
130#
131# Each returns the output type ("html" or "text") or "fail" if no
132# conversion is possible.
133
134# Convert a Microsoft word document
135
# Convert a file that claims to be a Microsoft Word document.
#
# Arguments: input filename, output filestem, requested output type
# (may be empty/undefined).  The real file type is sniffed first and the
# matching converter is used.  Returns whatever that converter returns:
# "html", "text" or "fail".
sub convertDOC {
    # lexical copies: do not leak the arguments into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word678") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    return convertAnything($input_filename, $output_filestem, $output_type);
}
150
151# Convert a Microsoft word 6/7/8 document
152
# Convert a Microsoft Word 6/7/8 document.
#
# Arguments: input filename, output filestem, requested output type
# (may be empty/undefined).  When HTML is acceptable the specialised
# converter is tried first; otherwise, or if it fails, the generic
# converter is used.  Returns "html", "text" or "fail".
sub convertWord678 {
    # lexical copies: do not leak the arguments into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if doc_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
168
169
170# Convert a Rich Text Format (RTF) file
171
# Convert a Rich Text Format (RTF) file.
#
# Arguments: input filename, output filestem, requested output type
# (may be empty/undefined).  When HTML is acceptable the specialised
# converter is tried first; otherwise, or if it fails, the generic
# converter is used.  Returns "html", "text" or "fail".
sub convertRTF {
    # lexical copies: do not leak the arguments into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if rtf_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
187
188
189# Convert an unidentified file
190
# Convert an unidentified file using the crude strings-style extractors.
#
# Arguments: input filename, output filestem, requested output type
# (may be empty/undefined, in which case HTML is tried before TEXT).
# Returns "html" or "text" for the first conversion that succeeds, or
# "fail" if none did.
sub convertAnything {
    # lexical copies: do not leak the arguments into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
213
214
215
216# Convert an Adobe PDF document
217
# Convert an Adobe PDF document.
#
# Arguments: directory of the document, input filename, output filestem,
# requested output type (may be empty/undefined, in which case HTML is
# tried before TEXT).  Returns "html", "text" or "fail".
sub convertPDF {
    # lexical copies: do not leak the arguments into package globals
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT.  pdf_to_text unpacks
    # (dirname, input, filestem), so $dirname has to be passed as well;
    # leaving it out shifts every argument by one position.
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
242
243
244# Convert an Adobe PostScript document
245
# Convert an Adobe PostScript document.
#
# Arguments: input filename, output filestem, requested output type
# (may be empty/undefined).  Only conversion to TEXT is available, so a
# request for HTML alone yields "fail".  Returns "text" or "fail".
sub convertPS {
    # lexical copies: do not leak the arguments into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
262
263
264# Find the real type of a .doc file
265#
266# We seem to have a lot of files with a .doc extension that are .rtf
267# files or Word 5 files. This function attempts to tell the difference.
268
# Sniff the real type of a file with a .doc extension.
#
# Argument: the filename to inspect.
# Returns "rtf" if the first line starts with "{\rtf", "word678" if any
# line contains a "Word.Document.6/7/8" marker, and "unknown" otherwise
# (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        return "unknown";
    }
    binmode($chk);    # Word files are binary; read the bytes untranslated

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        # check to see if this is an rtf file (signature on the first line)
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.[678]/) {
            $type = "word678";
            last;
        }

        $first = 0;
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
300
301
302
303# Specific type-to-type conversions
304#
305# Each of the following functions attempts to convert a document from
306# a specific format to another. If they succeed they return 1 and leave
307# the output document(s) in the appropriate place; if they fail they
308# return 0 and delete any working files.
309
310
311# Attempt to convert a word document to html with the wv program
312
# Attempt to convert a word document to html with the wv program.
#
# Arguments: input filename, output filestem.
# Runs wvWare from $GSDLHOME/packages/unix/wv, writing
# "<filestem>.html" (stdout) and "<filestem>.err" (stderr).  The
# conversion counts as successful when the first line of the HTML file
# contains "DOCTYPE HTML".  Returns 1 on success (the .err file is
# removed) and 0 otherwise (both working files are removed).
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $wv_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
    my $wv_conf = util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
    my $wvWare  = util::filename_cat($wv_home, "bin", "wvWare");
    return 0 unless (-e $wvWare);

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "ulimit -t 20;";
    $cmd .= "\"$wvWare\" --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\" 2>\"$err_file\"";

    # execute the command; system() returns -1 when it cannot be started
    # at all, so test for any non-zero status
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter: $!. Continuing...\n";
    }

    # Was the conversion successful?
    my $first_line;
    if (open(my $html_fh, '<', $html_file)) {
        $first_line = <$html_fh>;
        close($html_fh);
    }
    if (defined $first_line && $first_line =~ /DOCTYPE HTML/) {
        util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # An error of some sort occurred: remove whatever was left behind
    util::rm($html_file) if (-e $html_file);
    util::rm($err_file)  if (-e $err_file);
    return 0;
}
347
348
349# Attempt to convert an RTF document to html with rtf2html
350#
351# rtf2html isn't distributed with Greenstone because it is not
352# distributed under the GPL. If you know of a better solution,
353# please let me know.
354
# Attempt to convert an RTF document to html with rtf2html.
#
# Arguments: input filename, output filestem.
# Uses $GSDLHOME/packages/unix/rtf2html/rtf2html/rtf2html when that file
# exists, otherwise the first executable named "rtf2html" found on PATH.
# Writes "<filestem>.html" (stdout) and "<filestem>.err" (stderr).  The
# conversion counts as successful when the first line of the HTML file
# contains "DOCTYPE HTML".  Returns 1 on success (the .err file is
# removed) and 0 otherwise (both working files are removed); also
# returns 0 when no rtf2html program can be found.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $r_cmd = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                   "rtf2html", "rtf2html", "rtf2html");
    if (!-e $r_cmd) {
        # Fall back to a copy installed on PATH.  Testing -e "rtf2html"
        # would only look in the current directory, so search the PATH
        # directories explicitly.
        require File::Spec;
        my ($on_path) = grep { -f $_ && -x _ }
                        map  { File::Spec->catfile($_, "rtf2html") }
                        File::Spec->path();
        return 0 unless defined $on_path;
        $r_cmd = $on_path;
    }

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "ulimit -t 20;";
    $cmd .= "\"$r_cmd\"";
    $cmd .= " \"$input_filename\" > \"$html_file\" 2>\"$err_file\"";

    # execute the command; system() returns -1 when it cannot be started
    # at all, so test for any non-zero status
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter: $!. Continuing...\n";
    }

    # Was the conversion successful?
    my $first_line;
    if (open(my $html_fh, '<', $html_file)) {
        $first_line = <$html_fh>;
        close($html_fh);
    }
    if (defined $first_line && $first_line =~ /DOCTYPE HTML/) {
        util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # An error of some sort occurred: remove whatever was left behind
    util::rm($html_file) if (-e $html_file);
    util::rm($err_file)  if (-e $err_file);
    return 0;
}
389
390
391# Convert a pdf file to html with the pdftohtml command
392
# Convert a pdf file to html with the pdftohtml command.
#
# Arguments: directory of the document, input filename, output filestem.
# Runs pdftohtml (looked up on PATH by the shell) asking for
# "<filestem>.html"; its standard output is captured in
# "<filestem>.out", which is always removed afterwards.
# Returns 1 if the command exits with status 0; otherwise reports the
# failure on STDERR, removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";

    # NOTE(review): $dirname is still passed unquoted, as before, so a
    # directory name containing spaces will not survive the shell.
    my $cmd = "pdftohtml -F -d $dirname -o \"$html_file\" \"$input_filename\"";
    $cmd .= " > \"$out_file\"";

    # system() returns -1 when the command cannot be started at all, so
    # test for any non-zero status
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm($html_file) if (-e $html_file);
        util::rm($out_file)  if (-e $out_file);
        return 0;
    }

    util::rm($out_file) if (-e $out_file);
    return 1;
}
410
411
412# Convert a PDF file to text with the pdftotext command
413
# Convert a PDF file to text with the pdftotext command.
#
# Arguments: (dirname, input filename, output filestem) or just
# (input filename, output filestem).  The directory is not needed, and
# this file calls the function with two arguments, so only the last two
# arguments are used; both call forms therefore work.
# Writes "<filestem>.text"; stderr goes to "<filestem>.err", which is
# always removed afterwards.  Returns 1 if the command exits with status
# 0; otherwise reports the failure on STDERR, removes the working files
# and returns 0.
sub pdf_to_text {
    my ($input_filename, $output_filestem) = @_[-2, -1];

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    # pdftotext writes to the file named by its second argument; without
    # one it creates "<input>.txt" rather than writing to stdout, so
    # redirecting stdout would only ever produce an empty file.
    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= " 2> \"$err_file\"";

    # system() returns -1 when the command cannot be started at all, so
    # test for any non-zero status
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm($text_file) if (-e $text_file);
        util::rm($err_file)  if (-e $err_file);
        return 0;
    }

    util::rm($err_file) if (-e $err_file);
    return 1;
}
431
432
433# Convert a PostScript document to text with ps2ascii
434
# Convert a PostScript document to text with ps2ascii.
#
# Arguments: input filename, output filestem.
# The command's stdout is redirected to "<filestem>.text" and its stderr
# to "<filestem>.err", which is always removed afterwards.
# Returns 1 if the command exits with status 0; otherwise reports the
# failure on STDERR, removes the working files and returns 0.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "ps2ascii \"$input_filename\" > \"$text_file\"";
    $cmd .= " 2> \"$err_file\"";

    # system() returns -1 when the command cannot be started at all, so
    # test for any non-zero status
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm($text_file) if (-e $text_file);
        util::rm($err_file)  if (-e $err_file);
        return 0;
    }

    util::rm($err_file) if (-e $err_file);
    return 1;
}
452
453
454# Convert any file to HTML with a crude perl implementation of the
455# UNIX strings command.
456
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filestem.
# First produces "<filestem>.text" via any_to_text, then wraps every line
# of it in a "<p>" inside a minimal HTML page written to
# "<filestem>.html".  The intermediate text file is removed.
# Returns 1 on success, 0 if the text extraction fails or either file
# cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        close($text_fh);
        util::rm($text_file) if (-e $text_file);
        return 0;
    }

    # Double-quoted so that the newlines are real newlines, not a
    # literal backslash-n sequence.
    print $html_fh "<html><head>\n",
        "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n",
        "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n",
        "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # the extracted text is arbitrary, so escape HTML metacharacters
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print $html_fh "<p> ", $line;
    }
    print $html_fh "\n</body></html>\n";

    # close before removing the text file and before reporting success,
    # so that buffered output is actually on disk
    close($text_fh);
    my $written = close($html_fh);

    util::rm($text_file) if (-e $text_file);
    return $written ? 1 : 0;
}
480
481# Convert any file to TEXT with a crude perl implementation of the
482# UNIX strings command.
483
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filestem.
# Reads the input line by line, turns every run of characters outside
# the printable ASCII range (octal 040-176) into a line break, drops
# fragments shorter than 10 characters and writes what is left to
# "<filestem>.text".
# Returns 1 on success, 0 if the input cannot be opened or the output
# cannot be created or completely written.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my ($in_fh, $out_fh);
    open($in_fh, '<', $input_filename) or return 0;
    if (!open($out_fh, '>', "$output_filestem.text")) {
        close($in_fh);
        return 0;
    }
    binmode($in_fh);    # the input may be binary; read the bytes untranslated

    while (my $line = <$in_fh>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^[^\n]{0,9}$/\n/mg;
        while ($line =~ /^[^\n]{1,9}$/m) {
            $line =~ s/^[^\n]{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out_fh $line;
        }
    }

    # The output must be closed (and so flushed) before returning:
    # any_to_html reads the .text file straight after this call.
    close($in_fh);
    return close($out_fh) ? 1 : 0;
}
515
516
517
Note: See TracBrowser for help on using the repository browser.