Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 1970

Last change on this file since 1970 was 1970, checked in by sjboddie, 23 years ago
Added more usage information to all perl programs and removed a few programs that are no longer useful.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 14.4 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. These are usually found in the
31	# $GSDLHOME/packages directory.
32	#
33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34	# conversion utilities. We can convert any file to text with a perl
35	# implementation of the UNIX strings command.
36
37
BEGIN {
    # This runs at compile time, so the extra @INC entry is already in
    # place when the "use" statements that follow are processed.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
42
43	use parsargv;
44	use util;
45	use Cwd;
46	use File::Basename;
47
48
# Print the command-line synopsis to STDERR and terminate the program
# with exit status 1. Takes no arguments and never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }

    exit(1);
}
59
60
# Program entry point.
#
# Arguments: the command-line argument list (options followed by exactly
# one input filename).
#
# Parses the options, works out the input type (from -type, or failing
# that from the filename suffix), changes into the input file's directory,
# runs the matching convert* function and prints its result ("html",
# "text" or "fail") followed by a newline on STDOUT.
#
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable input file, or an unsupported input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # The *_to_* converter subs read the package variable $timeout, so the
    # option must be stored there. A lexical declared here would be
    # invisible to them and -timeout would silently have no effect.
    $main::timeout = 0;

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$main::timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below, so a relative name would
    # stop resolving afterwards. Make it absolute first.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename suffix without its
    # leading dot. This stays undefined when there is no suffix at all.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = (defined $suffix && length($suffix) > 1)
            ? substr($suffix, 1)
            : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        print convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
132
# Run the converter on the command-line arguments.
main(@ARGV);
134
135
136
137	# Document-type conversion functions
138	#
139	# The following functions attempt to convert documents from their
140	# input type to the specified output type. If no output type was
141	# given, then they first attempt HTML, and then TEXT.
142	#
143	# Each returns the output type ("html" or "text") or "fail" if no
144	# conversion is possible.
145
# Convert a Microsoft Word document.
#
# Arguments: input filename, output filename stem (no extension), and the
# requested output type ("html", "text", or empty for "try both").
# Returns whatever the delegated converter returns: "html", "text" or
# "fail".
#
# The arguments are held in lexicals so that nested converter calls cannot
# clobber each other's values through shared package globals.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic strings-based conversion
    return convertAnything($input_filename, $output_filestem, $output_type);
}
162
# Convert a Microsoft Word 6/7/8 document.
#
# Arguments: input filename, output filename stem, requested output type
# (empty means "no preference").
# Returns "html" when doc_to_html succeeds, otherwise the result of
# convertAnything ("html", "text" or "fail").

sub convertWord678 {
    # Lexical arguments: do not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if doc_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
180
181
# Convert a Rich Text Format (RTF) file.
#
# Arguments: input filename, output filename stem, requested output type
# (empty means "no preference").
# Returns "html" when rtf_to_html succeeds, otherwise the result of
# convertAnything ("html", "text" or "fail").

sub convertRTF {
    # Lexical arguments: do not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if rtf_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
199
200
# Convert an unidentified file.
#
# Arguments: input filename, output filename stem, requested output type
# (empty means "no preference").
# Tries any_to_html when HTML is acceptable, then any_to_text when TEXT is
# acceptable. Returns "html", "text", or "fail" if neither succeeded.

sub convertAnything {
    # Lexical arguments: do not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
225
226
227
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output filename
# stem, requested output type (empty means "no preference").
# Returns "html", "text", or "fail" if no conversion succeeded.

sub convertPDF {
    # Lexical arguments: do not leak into package globals
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT.
    # pdf_to_text unpacks (dirname, input, stem) just like pdf_to_html, so
    # the directory must be passed too. Without it every argument is
    # shifted by one position.
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";

}
254
255
# Convert an Adobe PostScript document.
#
# Arguments: input filename, output filename stem, requested output type
# (empty means "no preference").
# Only a TEXT conversion is attempted. Returns "text" on success and
# "fail" otherwise (including when only HTML output was requested).

sub convertPS {
    # Lexical arguments: do not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";

}
274
275
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the filename to inspect.
# Returns "rtf" if the first line starts with "{\rtf", "word6", "word7"
# or "word8" if any line contains "Word.Document.N", and "unknown"
# otherwise (including when the file cannot be opened).

sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Error: unable to open $input_filename: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        # the rtf signature only counts at the very start of the file
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }

        $first = 0;
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
313
314
315
316	# Specific type-to-type conversions
317	#
318	# Each of the following functions attempts to convert a document from
319	# a specific format to another. If they succeed they return 1 and leave
320	# the output document(s) in the appropriate place; if they fail they
321	# return 0 and delete any working files.
322
323
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output filename stem.
# Returns 1 if "<stem>.html" was produced and its first line contains
# "DOCTYPE HTML". Returns 0 otherwise, after removing the working files.
# Reads the package variable $main::timeout (max CPU seconds, 0 = none).

sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = "";
    my $wv_conf = "";

    # GSDLOS may be unset; treat that as "not windows"
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
        $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";

    } else {
        my $wv_home = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
        $wv_conf = util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
        $wvWare = util::filename_cat($wv_home, "bin", "wvWare");
    }
    return 0 unless (-e "$wvWare");

    # formulate the command
    my $cmd = "";
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command. system() returns -1 when the command could not
    # be started (only then is $! meaningful) and the wait status otherwise.
    my $status = system($cmd);
    if ($status != 0)
    {
        my $reason = ($status == -1) ? "$!" : "wait status $status";
        print STDERR "Error executing wv converter: $reason. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        util::rm("$output_filestem.html");
    }

    # do not leave the captured stderr behind on failure
    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
372
373
# Attempt to convert an RTF document to html with rtftohtml
#
# rtf2html isn't distributed with Greenstone because it is not
# distributed under the GPL. If you know of a better solution,
# please let me know.
#
# Arguments: input filename, output filename stem.
# Returns 1 if "<stem>.html" was produced and its first line contains
# "DOCTYPE HTML". Returns 0 otherwise (including when no rtf2html program
# can be found), after removing the working files.
# Reads the package variable $main::timeout (max CPU seconds, 0 = none).

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $r_cmd = util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                   "rtf2html", "rtf2html", "rtf2html");
    if (!-e "$r_cmd") {
        # Fall back to an rtf2html on the search path. Testing
        # -e "rtf2html" would only look in the current directory, so walk
        # the PATH entries instead.
        require File::Spec;
        my $on_path = 0;
        foreach my $dir (File::Spec->path()) {
            my $candidate = File::Spec->catfile($dir, "rtf2html");
            if (-f $candidate && -x $candidate) {
                $on_path = 1;
                last;
            }
        }
        return 0 unless ($on_path);
        $r_cmd = "rtf2html";
    }

    my $cmd = "";
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command. system() returns -1 when the command could not
    # be started (only then is $! meaningful) and the wait status otherwise.
    my $status = system($cmd);
    if ($status != 0)
    {
        my $reason = ($status == -1) ? "$!" : "wait status $status";
        print STDERR "Error executing rtf converter: $reason. Continuing...\n";
    }

    # Was the conversion successful?
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        util::rm("$output_filestem.html");
    }

    # do not leave the captured stderr behind on failure
    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
415
416
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the input file (currently unused), input
# filename, output filename stem.
# Returns 1 if "<stem>.html" exists after running pdftohtml.pl, else 0.
# On failure any "<stem>.err" content is echoed to STDERR.
# Reads the package variable $main::timeout (max CPU seconds, 0 = none).

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($main::timeout) {$cmd = "ulimit -t $main::timeout;";}
    $cmd .= "pdftohtml.pl -F ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # system() returns -1 if the command could not be started at all, so
    # compare against 0 rather than testing for a positive value
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        return 0;
    }

    my $made_html = (-e "$output_filestem.html") ? 1 : 0;

    # the .out working file is not wanted either way
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # make sure the converter made something
    if (!$made_html)
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open (my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftohtml:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
        }
        return 0;
    }

    return 1;
}
452
# Convert a PDF file to text with the pdftotext command
#
# Arguments: (dirname, input filename, output filename stem). The leading
# dirname is unused and may be omitted, so the two-argument form
# (input filename, output filename stem) is accepted as well.
# Returns 1 if "<stem>.text" was produced, 0 otherwise. On failure any
# captured stderr is echoed to STDERR and the working files are removed.

sub pdf_to_text {
    my @args = @_;
    # tolerate callers that pass only (input, stem)
    shift(@args) if (scalar(@args) > 2);
    my ($input_filename, $output_filestem) = @args;

    # pdftotext writes to the file named by its second argument. With no
    # second argument it creates <input>.txt and prints nothing on stdout,
    # so redirecting stdout would only yield an empty file.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # make sure the converter made something (the .text file, not .html)
    if (! -e "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open (my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftotext:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
488
# Convert a PostScript document to text with ps2ascii
#
# Arguments: input filename, output filename stem.
# Returns 1 if ps2ascii ran successfully (its output is in
# "<stem>.text"), 0 otherwise with the working files removed.

sub ps_to_text {
    # Lexical arguments: do not leak into package globals
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    # quote the stderr target too, so stems containing spaces work
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() returns -1 if the command could not be started at all, so
    # compare against 0 rather than testing for a positive value
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
508
509
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filename stem.
# Runs any_to_text first, then wraps each line of "<stem>.text" in a
# "<p>" inside a minimal HTML page written to "<stem>.html". The
# intermediate text file is removed afterwards.
# Returns 1 on success, 0 if the text extraction or either open failed.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    my $ok = open($text, '<', "$output_filestem.text") ? 1 : 0;
    if ($ok && !open($html, '>', "$output_filestem.html")) {
        close($text);
        $ok = 0;
    }

    if ($ok) {
        print $html '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
        print $html "\n\n";

        while (my $text_line = <$text>) {
            print $html "<p> ", $text_line;
        }
        print $html "\n</body></html>\n";

        # close explicitly so the page is fully written before we return
        close($text);
        $ok = 0 unless (close($html));
    }

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return $ok;
}
538
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output filename stem.
# Writes every run of at least 10 printable ASCII characters found in the
# input, one per line, to "<stem>.text".
# Returns 1 on success, 0 if the input or output file cannot be opened or
# the output cannot be completely written.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    my ($in, $out);
    open($in, '<', $input_filename) || return 0;
    binmode($in);
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    close($in);
    # Close the output before returning: callers such as any_to_html read
    # the file straight away and must not see a partly flushed buffer.
    close($out) || return 0;
    return 1;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: