Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 1928

Last change on this file since 1928 was 1928, checked in by sjboddie, 23 years ago

Added: pdftohtml.pl - Perl script that handles conversion of PDF documents into

HTML. Called by gsConvert.pl in sub pdf_to_html.

Modified: gsConvert.pl - Perl script that converts various formats (MSWord,

RTF, PDF, PS) into HTML when importing the collection.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 15.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML ot TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. These are usually found in the
31	# $GSDLHOME/packages directory.
32	#
33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34	# conversion utilities. We can convery any file to text with a perl
35	# implementation of the UNIX strings command.
36
37
38	BEGIN {
39	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
40	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
41	}
42
43	use parsargv;
44	use util;
45	use Cwd;
46	use File::Basename;
47
48
49	sub print_usage
50	{
51	print STDERR "Usage: $0 [options] filename\n";
52	print STDERR "Options are:\n\t-type\tdoc\|pdf\n\t-output\thtml\|text\n";
53	print STDERR "\t-timeout\t<max cpu seconds>\n";
54	exit(1);
55	}
56
57
58	sub main
59	{
60	my (@ARGV) = @_;
61	my ($input_type,$output_type,$verbose,$timeout);
62
63	$timeout = 0;
64	# read command-line arguments
65	if (!parsargv::parse(\@ARGV,
66	'type/(doc\|pdf)/', \$input_type,
67	'output/(html\|text)/', \$output_type,
68	'timeout/\d+/0',\$timeout,
69	'verbose/\d+/0', \$verbose))
70	{
71	print_usage();
72	}
73
74	# Make sure the input file exists and can be opened for reading
75	if (scalar(@ARGV!=1)) {
76	print_usage();
77	}
78
79	my $input_filename = $ARGV[0];
80	if (!-r $input_filename) {
81	print STDERR "Error: unable to open $input_filename for reading\n";
82	exit(1);
83	}
84
85	# Deduce filenames
86	my ($tailname,$dirname,$suffix)
87	= File::Basename::fileparse($input_filename,'\..+');
88	my $output_filestem = &util::filename_cat($dirname,"$tailname");
89
90	if ($input_type eq "")
91	{
92	$input_type = substr($suffix,1,length($suffix)-1);
93	}
94
95	# Change to temporary working directory
96	my $stored_dir = cwd();
97	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
98
99	# Select convert utility
100	if (!defined $input_type) {
101	print STDERR "Error: No filename extension or input type defined\n";
102	exit(1);
103	}
104	elsif ($input_type eq "doc") {
105	print &convertDOC($input_filename, $output_filestem, $output_type);
106	print "\n";
107	}
108	elsif ($input_type eq "rtf") {
109	print &convertRTF($input_filename, $output_filestem, $output_type);
110	print "\n";
111	}
112	elsif ($input_type eq "pdf") {
113	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
114	print "\n";
115	}
116	elsif ($input_type eq "ps") {
117	print &convertPS($input_filename, $output_filestem, $output_type);
118	print "\n";
119	}
120	else {
121	print STDERR "Error: Unable to convert type '$input_type'\n";
122	exit(1);
123	}
124
125	# restore to original working directory
126	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
127
128	}
129
130	&main(@ARGV);
131
132
133
134	# Document-type conversion fucntions
135	#
136	# The following functions attempt to convert documents from their
137	# input type to the specified output type. If no output type was
138	# given, then they first attempt HTML, and then TEXT.
139	#
140	# Each returns the output type ("html" or "text") or "fail" if no
141	# conversion is possible.
142
143	# Convert a Microsoft word document
144
145	sub convertDOC {
146	($input_filename, $output_filestem, $output_type) = @_;
147
148	# Many .doc files are not in fact word documents!
149	my $realtype = &find_docfile_type($input_filename);
150
151	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
152	return &convertWord678($input_filename, $output_filestem, $output_type);
153	} elsif ($realtype eq "rtf") {
154	return &convertRTF($input_filename, $output_filestem, $output_type);
155	} else {
156	return &convertAnything($input_filename, $output_filestem, $output_type);
157	}
158	}
159
160	# Convert a Microsoft word 6/7/8 document
161
162	sub convertWord678 {
163	($input_filename, $output_filestem, $output_type) = @_;
164
165	my $success = 0;
166
167	# Attempt specialised conversion to HTML
168	if (!$output_type \|\| ($output_type =~ /html/i)) {
169	print STDERR "I am about to call doc_to_html...\n";
170	$success = &doc_to_html($input_filename, $output_filestem);
171	if ($success) {
172	return "html";
173	}
174	}
175
176	return &convertAnything($input_filename, $output_filestem, $output_type);
177	}
178
179
180	# Convert a Rich Text Format (RTF) file
181
182	sub convertRTF {
183	($input_filename, $output_filestem, $output_type) = @_;
184
185	my $success = 0;
186
187	# Attempt specialised conversion to HTML
188	if (!$output_type \|\| ($output_type =~ /html/i)) {
189	$success = &rtf_to_html($input_filename, $output_filestem);
190	if ($success) {
191	return "html";
192	}
193	}
194
195	return &convertAnything($input_filename, $output_filestem, $output_type);
196	}
197
198
199	# Convert an unidentified file
200
201	sub convertAnything {
202	($input_filename, $output_filestem, $output_type) = @_;
203
204	my $success = 0;
205
206	# Attempt simple conversion to HTML
207	if (!$output_type \|\| ($output_type =~ /html/i)) {
208	$success = &any_to_html($input_filename, $output_filestem);
209	if ($success) {
210	return "html";
211	}
212	}
213
214	# Convert to text
215	if (!$output_type \|\| ($output_type =~ /text/i)) {
216	$success = any_to_text($input_filename, $output_filestem);
217	if ($success) {
218	return "text";
219	}
220	}
221	return "fail";
222	}
223
224
225
226	# Convert an Adobe PDF document
227
228	sub convertPDF {
229	($dirname, $input_filename, $output_filestem, $output_type) = @_;
230
231	my $success = 0;
232
233	# Attempt conversion to HTML
234	if (!$output_type \|\| ($output_type =~ /html/i)) {
235	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
236	if ($success) {
237	return "html";
238	}
239	}
240
241	# Attempt conversion to TEXT
242	if (!$output_type \|\| ($output_type =~ /text/i)) {
243	$success = &pdf_to_text($input_filename, $output_filestem);
244	if ($success) {
245	return "text";
246	}
247	}
248
249	return "fail";
250
251	}
252
253
254	# Convert an Adobe PostScript document
255
256	sub convertPS {
257	($input_filename, $output_filestem, $output_type) = @_;
258
259	my $success = 0;
260
261	# Attempt conversion to TEXT
262	if (!$output_type \|\| ($output_type =~ /text/i)) {
263	$success = &ps_to_text($input_filename, $output_filestem);
264	if ($success) {
265	return "text";
266	}
267	}
268
269	return "fail";
270
271	}
272
273
274	# Find the real type of a .doc file
275	#
276	# We seem to have alot of files with a .dco extension that are .rtf
277	# files or Word 5 files. This function attempts to tell the difference.
278
279	sub find_docfile_type {
280	($input_filename) = @_;
281
282	open(TMP, ">temp.txt");
283	binmode(TMP);
284	open(CHK, "<$input_filename");
285	binmode(CHK);
286	my $line = "";
287	my $first = 1;
288
289	while (<CHK>) {
290
291	$line = $_;
292	print TMP "$line\n\n";
293	if ($first) {
294	# check to see if this is an rtf file
295	if ($line =~ /^\{\\rtf/) {
296	close(CHK);
297	return "rtf";
298	}
299	}
300
301	# is this is a word 6/7/8 document?
302	if ($line =~ /Word\.Document\.([678])/) {
303	close(CHK);
304	return "word$1";
305	}
306
307	$first = 0;
308
309	}
310
311	return "unknown";
312	}
313
314
315
316	# Specific type-to-type conversions
317	#
318	# Each of the following functions attempts to convert a document from
319	# a specific format to another. If they succeed yhey return 1 and leave
320	# the output document(s) in the appropriate place; if they fail they
321	# return 0 and delete any working files.
322
323
324	# Attempt to convert a word document to html with the wv program
325
326	sub doc_to_html {
327	($input_filename, $output_filestem) = @_;
328
329	my $wvWare = "";
330	my $wv_conf = "";
331
332	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
333	$wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
334	$wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";
335
336	} else {
337	# formulate the command
338	my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
339	$wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
340	$wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
341	}
342	return 0 unless (-e "$wvWare");
343
344	$cmd = "";
345	if ($timeout) {$cmd = "ulimit -t $timeout;";}
346	$cmd .= "$wvWare --charset utf-8 --config $wv_conf";
347	$cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
348
349	# execute the command
350	if (system($cmd)>0)
351	{
352	print STDERR "Error executing wv converter: $!. Continuing...\n";
353	}
354
355	# Was the conversion successful?
356	if (-e "$output_filestem.html") {
357	open(TMP, "$output_filestem.html");
358	$line = <TMP>;
359	close(TMP);
360	if ($line && $line =~ /DOCTYPE HTML/) {
361	&util::rm("$output_filestem.err");
362	return 1;
363	} else {
364	# An error of some sort occurred
365	&util::rm("$output_filestem.html");
366	&util::rm("$output_filestem.err");
367	}
368	}
369
370	return 0;
371	}
372
373
374	# Attempt to convert an RTF document to html with rtftohtml
375	#
376	# rtf2html isn't distributed with Greenstone because it is not
377	# distributed under teh GPL. If you know of a better solution,
378	# please let me know.
379
380	sub rtf_to_html {
381	($input_filename, $output_filestem) = @_;
382
383	# formulate the command
384	my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
385	"rtf2html", "rtf2html", "rtf2html");
386	$r_cmd = "rtf2html" unless (-e "$r_cmd");
387	return 0 unless (-e "$r_cmd");
388	$cmd = "";
389	if ($timeout) {$cmd = "ulimit -t $timeout;";}
390	$cmd .= "$r_cmd";
391	$cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
392
393	# execute the command
394	if (system($cmd)>0)
395	{
396	print STDERR "Error executing rtf converter: $!. Continuing...\n";
397	}
398
399	# Was the conversion successful?
400	if (-e "$output_filestem.html") {
401	open(TMP, "$output_filestem.html");
402	$line = <TMP>;
403	close(TMP);
404	if ($line && $line =~ /DOCTYPE HTML/) {
405	&util::rm("$output_filestem.err");
406	return 1;
407	} else {
408	# An error of some sort occurred
409	&util::rm("$output_filestem.html");
410	&util::rm("$output_filestem.err");
411	}
412	}
413	return 0;
414	}
415
416
417	# Convert a pdf file to html with the pdftohtml command
418
419	sub pdf_to_html {
420	($dirname, $input_filename, $output_filestem) = @_;
421
422	$cmd = "";
423	if ($timeout) {$cmd = "ulimit -t $timeout;";}
424	$cmd .= "pdftohtml.pl -F ";
425	$cmd .= " \"$input_filename\" \"$output_filestem\"";
426
427	if (system($cmd)>0)
428	{
429	print STDERR "Error executing $cmd: $!\n";
430	return 0;
431	}
432
433	# make sure the converter made something
434	if (! -e "$output_filestem.html")
435	{
436	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
437	# print out the converters std err, if any
438	if (-e "$output_filestem.err") {
439	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
440	print STDERR "pdftohtml:\n";
441	while (<ERRLOG>) {
442	print STDERR "$_";
443	}
444	close ERRLOG;
445	}
446	return 0;
447	}
448
449	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
450	return 1;
451	}
452
453	# Convert a PDF file to text with the pdftotext command
454
455	sub pdf_to_text {
456	($dirname, $input_filename, $output_filestem) = @_;
457
458	$cmd = "pdftotext \"$input_filename\" > \"$output_filestem.text\"";
459	$cmd .= " 2> $output_filestem.err";
460
461	if (system($cmd)>0)
462	{
463	print STDERR "Error executing $cmd: $!\n";
464	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
465	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
466	return 0;
467	}
468
469	# make sure the converter made something
470	if (! -e "$output_filestem.html")
471	{
472	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
473	# print out the converters std err, if any
474	if (-e "$output_filestem.err") {
475	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
476	print STDERR "pdftotext:\n";
477	while (<ERRLOG>) {
478	print STDERR "$_";
479	}
480	close ERRLOG;
481	}
482	return 0;
483	}
484
485	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
486	return 1;
487	}
488
489	# Convert a PostScript document to text with ps2ascii
490
491	sub ps_to_text {
492	($input_filename, $output_filestem) = @_;
493
494	my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
495	$cmd .= " 2> $output_filestem.err";
496	if (system($cmd)>0)
497	{
498	print STDERR "Error executing $cmd: $!\n";
499	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
500	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
501
502	# Fine then. We'll just do a lousy job by ourselves...
503	# Based on code nicked from:
504	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
505	#
506	print STDERR "Attempting to strip text from postscript.\n";
507	my $errorcode=0;
508	open (IN, "$input_filename")
509	\|\| ($errorcode=1, warn "Couldn't read file: $!");
510	open (OUT, ">$output_filestem.text")
511	\|\| ($errorcode=1, warn "Couldn't write file: $!");
512	if ($errorcode) {print STDERR "errors\n";return 0;}
513
514	my $in_a_sentence=0;
515	while (<IN>) {
516	if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
517	# attempt to add whitespace between different lines...
518	s/F.?\(/\( /g; # this might break up some other words though...
519	### remove all postscript control data
520	if (!$in_a_sentence) {
521	s/^[^\(\)]*?\(//;} # rm start of line up to first open bracket
522	s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
523	s/\)([^\(\)])*?\(//g ; # close bracket up to next open unquoted bracket
524	if (s/\)[^\(\)]*?$//g) # last close bracket to end of line
525	{$in_a_sentence=0;chomp;}
526	if (s/\\$//) # if line is a continuation
527	{$in_a_sentence=1;chomp;}
528	s/^$//g ; # remove empty lines
529	### ligatures have special characters...
530	s/\\214/fi/g;
531	s/\\215/fl/g;
532	print OUT "$_";
533	}
534	close IN; close OUT;
535	}
536	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
537	return 1;
538	}
539
540
541	# Convert any file to HTML with a crude perl implementation of the
542	# UNIX strings command.
543
544	sub any_to_html {
545	($input_filename, $output_filestem) = @_;
546
547	# First generate a text file
548	return 0 unless (&any_to_text($input_filename, $output_filestem));
549
550	# create an HTML file from the text file
551	open(TEXT, "<$output_filestem.text");
552	open(HTML, ">$output_filestem.html");
553
554	print HTML '<html><head>
555	<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
556	<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
557	</head><body>';
558	print HTML "\n\n";
559
560	while (<TEXT>) {
561	print HTML "<p> ", $_;
562
563	}
564	print HTML "\n</body></html>\n";
565
566	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
567	return 1;
568	}
569
570	# Convert any file to TEXT with a crude perl implementation of the
571	# UNIX strings command.
572
573	sub any_to_text {
574	($input_filename, $output_filestem) = @_;
575
576	#open(TEMP, ">temp.txt");
577	open(IN, "<$input_filename");
578	binmode(IN);
579	open(OUT, ">$output_filestem.text");
580
581	my ($line);
582	my $dgcount = 0;
583	while (<IN>) {
584	$line = $_;
585
586	# delete anything that isn't a printable character
587	#print TEMP $line;
588	$line =~ s/[^\040-\176]+/\n/sg;
589
590	# delete any string less than 10 characters long
591	$line =~ s/^.{0,9}$/\n/mg;
592	while ($line =~ /^.{1,9}$/m) {
593	$line =~ s/^.{0,9}$/\n/mg;
594	$line =~ s/\n+/\n/sg;
595	}
596
597	# remove extraneous whitespace
598	$line =~ s/\n+/\n/gs;
599	$line =~ s/^\n//gs;
600
601	# output whatever is left
602	if ($line =~ /[^\n ]/) {
603	print OUT $line;
604	}
605	}
606	return 1;
607	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: