Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 1654

Last change on this file since 1654 was 1654, checked in by paynter, 24 years ago
Check .doc files to see if they are RTF files, Word 6/7/8 files that wv handles, or "unknown" files (which we strip of binary characters and hope the result is worthwhile).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.8 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. These are usually found in the
31	# $GSDLHOME/packages directory.
32	#
33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34	# conversion utilities. We can convert any file to text with a perl
35	# implementation of the UNIX strings command.
36
37
BEGIN {
    # The script cannot locate Greenstone's perl modules without GSDLHOME,
    # so refuse to start when it is missing.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }

    # Search $GSDLHOME/perllib before the standard module directories.
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
42
43	use parsargv;
44	use util;
45	use Cwd;
46	use File::Basename;
47
48
# Print a one-line usage summary on STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage
{
    my $usage = "Usage: $0 [-type doc|pdf] [-output html|text] filename\n";
    print STDERR $usage;
    exit(1);
}
54
55
# Entry point: parse the command line, work out the input type and run the
# matching converter.
#
# Arguments: the command-line argument list (normally @ARGV).
#
# Prints the value returned by the converter ("html", "text" or "fail")
# followed by a newline on STDOUT.  Exits with status 1 on a usage error,
# an unreadable input file, a missing filename extension (when no -type
# was given) or an input type no converter exists for.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments ("ps" is accepted because a PostScript
    # converter is dispatched to below)
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf|ps)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir() into the file's directory below; a relative path would
    # stop resolving after that, so make it absolute first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames.  Only the last ".xxx" component is the suffix, so
    # "annual.report.doc" has stem "annual.report" and suffix ".doc".
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\.[^.]+');
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the (lower-cased) filename extension
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname: $!";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        print convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir: $!";
}
120
# Run the conversion on the script's command-line arguments.
main(@ARGV);
122
123
124
125	# Document-type conversion functions
126	#
127	# The following functions attempt to convert documents from their
128	# input type to the specified output type. If no output type was
129	# given, then they first attempt HTML, and then TEXT.
130	#
131	# Each returns the output type ("html" or "text") or "fail" if no
132	# conversion is possible.
133
# Convert a Microsoft Word document (or any other file with a .doc
# extension).
#
# Arguments: input filename, output filestem, requested output type
# ("html", "text", or empty/undefined for "whatever works").
# Returns whatever the selected converter returns: "html", "text" or "fail".

sub convertDOC {
    # lexical copies, so the arguments are not stored in package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word678") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # anything unrecognised gets the generic treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
150
# Convert a Microsoft Word 6/7/8 document
#
# Arguments: input filename, output filestem, requested output type
# ("html", "text", or empty/undefined for "whatever works").
# Returns "html" when the specialised Word-to-HTML conversion succeeds;
# otherwise returns the result of the generic convertAnything fallback.

sub convertWord678 {
    # lexical copies, so the arguments are not stored in package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (doc_to_html($input_filename, $output_filestem));
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
168
169
# Convert a Rich Text Format (RTF) file
#
# Arguments: input filename, output filestem, requested output type
# ("html", "text", or empty/undefined for "whatever works").
# Returns "html" when the specialised RTF-to-HTML conversion succeeds;
# otherwise returns the result of the generic convertAnything fallback.

sub convertRTF {
    # lexical copies, so the arguments are not stored in package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (rtf_to_html($input_filename, $output_filestem));
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
187
188
# Convert an unidentified file
#
# Arguments: input filename, output filestem, requested output type
# ("html", "text", or empty/undefined for "whatever works").
# Tries HTML first and then plain text, skipping whichever format the
# requested output type rules out.
# Returns "html", "text", or "fail" if neither conversion succeeded.

sub convertAnything {
    # lexical copies, so the arguments are not stored in package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
213
214
215
# Convert an Adobe PDF document
#
# Arguments: directory of the input file, input filename, output filestem,
# requested output type ("html", "text", or empty/undefined for "whatever
# works").
# Tries HTML first and then plain text, skipping whichever format the
# requested output type rules out.
# Returns "html", "text", or "fail" if neither conversion succeeded.

sub convertPDF {
    # lexical copies, so the arguments are not stored in package globals
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html"
            if (pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT.  pdf_to_text unpacks three arguments
    # (directory, input filename, output filestem), so pass all three;
    # passing only two shifted every value by one position.
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text"
            if (pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
242
243
# Convert an Adobe PostScript document
#
# Arguments: input filename, output filestem, requested output type
# ("html", "text", or empty/undefined for "whatever works").
# Only a text conversion is attempted; it is skipped when the requested
# output type does not match /text/i.
# Returns "text" on success and "fail" otherwise.

sub convertPS {
    # lexical copies, so the arguments are not stored in package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
262
263
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are really RTF
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the name of the file to examine.
# Returns:
#   "rtf"      if the first line starts with "{\rtf"
#   "word678"  if any line contains "Word.Document.6", ".7" or ".8"
#   "unknown"  otherwise, including when the file cannot be opened

sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Warning: unable to open $input_filename: $!\n";
        return "unknown";
    }
    # the file is usually binary; read it without any newline translation
    binmode($chk);

    my $type = "unknown";
    my $is_first_line = 1;

    while (defined(my $line = <$chk>)) {
        # the RTF signature is only meaningful at the very start of the file
        if ($is_first_line && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.[678]/) {
            $type = "word678";
            last;
        }

        $is_first_line = 0;
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
300
301
302
303	# Specific type-to-type conversions
304	#
305	# Each of the following functions attempts to convert a document from
306	# a specific format to another. If they succeed they return 1 and leave
307	# the output document(s) in the appropriate place; if they fail they
308	# return 0 and delete any working files.
309
310
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output filestem.
# Runs wvWare from $GSDLHOME/packages/unix/wv (CPU time limited to 20
# seconds via ulimit), writing "<filestem>.html" and "<filestem>.err".
# Returns 1 and leaves "<filestem>.html" in place when the first line of
# the output contains "DOCTYPE HTML"; otherwise removes both working files
# and returns 0.  Also returns 0 when wvWare is not installed.

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $wv_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
    my $wv_conf = util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
    my $wvWare = util::filename_cat($wv_home, "bin", "wvWare");
    return 0 unless (-e $wvWare);

    my $html_file = "$output_filestem.html";
    my $err_file = "$output_filestem.err";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "ulimit -t 20;";
    $cmd .= "\"$wvWare\" --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\" 2>\"$err_file\"";

    # execute the command; a failure is only reported here, the output
    # file is still inspected below
    my $status = system($cmd);
    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? $! : "wait status $status";
        print STDERR "Error executing wv converter: $why. Continuing...\n";
    }

    # Was the conversion successful?
    my $converted = 0;
    if (-e $html_file) {
        my $tmp;
        if (open($tmp, '<', $html_file)) {
            my $line = <$tmp>;
            close($tmp);
            $converted = 1 if ($line && $line =~ /DOCTYPE HTML/);
        }
    }

    # the error log is never kept; the html only when it looks valid
    util::rm($err_file) if (-e $err_file);
    return 1 if ($converted);

    util::rm($html_file) if (-e $html_file);
    return 0;
}
347
348
# Attempt to convert an RTF document to html with rtf2html
#
# rtf2html isn't distributed with Greenstone because it is not
# distributed under the GPL. If you know of a better solution,
# please let me know.
#
# Arguments: input filename, output filestem.
# Looks for rtf2html under $GSDLHOME/packages/unix/rtf2html/rtf2html and,
# failing that, in the directories listed in PATH.
# Returns 1 and leaves "<filestem>.html" in place when the first line of
# the output contains "DOCTYPE HTML"; otherwise removes the working files
# and returns 0.  Also returns 0 when no rtf2html program can be found.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $r_cmd = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                   "rtf2html", "rtf2html", "rtf2html");
    if (!-e $r_cmd) {
        # Fall back to an rtf2html on the PATH.  Testing -e "rtf2html"
        # would only look in the current directory, so search explicitly.
        require File::Spec;
        my $found;
        foreach my $dir (File::Spec->path()) {
            my $candidate = File::Spec->catfile($dir, "rtf2html");
            if (-x $candidate && !-d $candidate) {
                $found = $candidate;
                last;
            }
        }
        return 0 unless (defined $found);
        $r_cmd = $found;
    }

    my $html_file = "$output_filestem.html";
    my $err_file = "$output_filestem.err";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "ulimit -t 20;";
    $cmd .= "\"$r_cmd\"";
    $cmd .= " \"$input_filename\" > \"$html_file\" 2>\"$err_file\"";

    # execute the command; a failure is only reported here, the output
    # file is still inspected below
    my $status = system($cmd);
    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? $! : "wait status $status";
        print STDERR "Error executing rtf converter: $why. Continuing...\n";
    }

    # Was the conversion successful?
    my $converted = 0;
    if (-e $html_file) {
        my $tmp;
        if (open($tmp, '<', $html_file)) {
            my $line = <$tmp>;
            close($tmp);
            $converted = 1 if ($line && $line =~ /DOCTYPE HTML/);
        }
    }

    # the error log is never kept; the html only when it looks valid
    util::rm($err_file) if (-e $err_file);
    return 1 if ($converted);

    util::rm($html_file) if (-e $html_file);
    return 0;
}
389
390
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory passed to pdftohtml's -d option, input filename,
# output filestem.
# pdftohtml's standard output is captured in "<filestem>.out", which is
# always removed afterwards.
# Returns 1 when the command exits successfully; otherwise removes
# "<filestem>.html" and returns 0.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file = "$output_filestem.out";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "pdftohtml -F -d \"$dirname\" -o \"$html_file\" \"$input_filename\"";
    $cmd .= " > \"$out_file\"";

    my $status = system($cmd);

    # the captured output is a working file only
    util::rm($out_file) if (-e $out_file);

    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? $! : "wait status $status";
        print STDERR "Error executing $cmd: $why\n";
        util::rm($html_file) if (-e $html_file);
        return 0;
    }

    return 1;
}
410
411
# Convert a PDF file to text with the pdftotext command
#
# Arguments: either (input filename, output filestem) or
# (directory, input filename, output filestem).  The directory is not
# needed here; it is accepted and ignored so that both calling
# conventions work.
# pdftotext's diagnostics are captured in "<filestem>.err", which is
# always removed afterwards.
# Returns 1 when the command exits successfully; otherwise removes
# "<filestem>.text" and returns 0.

sub pdf_to_text {
    my @args = @_;
    # drop the leading directory argument of the three-argument form
    shift(@args) if (scalar(@args) > 2);
    my ($input_filename, $output_filestem) = @args;

    my $text_file = "$output_filestem.text";
    my $err_file = "$output_filestem.err";

    # pdftotext takes the output file as its second argument; without one
    # it writes "<input>.txt" rather than standard output, so redirecting
    # stdout would leave an empty .text file.
    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= " 2> \"$err_file\"";

    my $status = system($cmd);

    # the error log is a working file only
    util::rm($err_file) if (-e $err_file);

    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? $! : "wait status $status";
        print STDERR "Error executing $cmd: $why\n";
        util::rm($text_file) if (-e $text_file);
        return 0;
    }

    return 1;
}
431
432
# Convert a PostScript document to text with ps2ascii
#
# Arguments: input filename, output filestem.
# ps2ascii's standard output becomes "<filestem>.text"; its diagnostics
# are captured in "<filestem>.err", which is always removed afterwards.
# Returns 1 when the command exits successfully; otherwise removes
# "<filestem>.text" and returns 0.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file = "$output_filestem.err";

    # every path is quoted so that names containing spaces survive the shell
    my $cmd = "ps2ascii \"$input_filename\" > \"$text_file\"";
    $cmd .= " 2> \"$err_file\"";

    my $status = system($cmd);

    # the error log is a working file only
    util::rm($err_file) if (-e $err_file);

    if ($status != 0) {
        # $! is only meaningful when the command could not be started (-1)
        my $why = ($status == -1) ? $! : "wait status $status";
        print STDERR "Error executing $cmd: $why\n";
        util::rm($text_file) if (-e $text_file);
        return 0;
    }

    return 1;
}
452
453
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filestem.
# First produces "<filestem>.text" with any_to_text, then wraps every line
# of it in a "<p>" inside a minimal HTML page written to
# "<filestem>.html".  The intermediate text file is removed afterwards.
# Returns 1 on success and 0 if the text extraction fails or a file
# cannot be opened or written.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', $text_file)) {
        print STDERR "Error: unable to read $text_file: $!\n";
        util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html, '>', $html_file)) {
        print STDERR "Error: unable to write $html_file: $!\n";
        close($text);
        util::rm($text_file) if (-e $text_file);
        return 0;
    }

    # double-quoted, so that \n really is a newline in the output
    print $html "<html><head>\n",
        "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n",
        "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n",
        "</head><body>\n\n";

    while (defined(my $line = <$text>)) {
        # the extracted text is arbitrary; keep it from being read as markup
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print $html "<p> ", $line;
    }

    print $html "\n</body></html>\n";

    close($text);
    # buffered write errors only surface at close
    my $written = close($html);

    util::rm($text_file) if (-e $text_file);

    if (!$written) {
        print STDERR "Error: unable to finish writing $html_file: $!\n";
        util::rm($html_file) if (-e $html_file);
        return 0;
    }
    return 1;
}
480
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filestem.
# Writes "<filestem>.text" containing the runs of printable ASCII
# characters (octal 040-176) found in the input, one run per line; runs
# shorter than 10 characters are discarded.
# Returns 1 on success and 0 if either file cannot be opened or the
# output cannot be completely written.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";

    my ($in, $out);
    if (!open($in, '<', $input_filename)) {
        print STDERR "Error: unable to read $input_filename: $!\n";
        return 0;
    }
    # the input is arbitrary data; read it without newline translation
    binmode($in);

    if (!open($out, '>', $text_file)) {
        print STDERR "Error: unable to write $text_file: $!\n";
        close($in);
        return 0;
    }

    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^[^\n]{0,9}$/\n/mg;
        while ($line =~ /^[^\n]{1,9}$/m) {
            $line =~ s/^[^\n]{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    close($in);

    # Close explicitly: callers read the text file as soon as we return,
    # and buffered output only reaches the file once the handle is
    # flushed.  Write errors also only surface here.
    if (!close($out)) {
        print STDERR "Error: unable to finish writing $text_file: $!\n";
        return 0;
    }

    return 1;
}
515
516
517

Note: See TracBrowser for help on using the repository browser.

Download in other formats: