Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 1734

Last change on this file since 1734 was 1734, checked in by jrm21, 23 years ago
For postscript, fall back to some simple text extraction if ps2ascii isn't found. (This should be portable as it is perl). It won't be formatted though, so currently is only useful for indexing - users will have to view the postscript for now...
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 16.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. These are usually found in the
31	# $GSDLHOME/packages directory.
32	#
33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34	# conversion utilities. We can convert any file to text with a perl
35	# implementation of the UNIX strings command.
36
37
BEGIN {
    # Refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the
    # front of the module search path before the "use" statements that
    # follow are compiled.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
42
43	use parsargv;
44	use util;
45	use Cwd;
46	use File::Basename;
47
48
sub print_usage
{
    # Write the usage summary to STDERR and terminate the script with
    # exit status 1.  This routine never returns to its caller.
    my @usage_lines = (
        "Usage: $0 [options] filename\n",
        "Options are:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\n",
    );
    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
56
57
sub main
{
    # Script entry point.
    #
    # Parses the command line, works out the input type (from -type or,
    # failing that, from the file name suffix), changes into the input
    # file's directory, runs the matching convertXXX routine and prints
    # the value it returns ("html", "text" or "fail") on STDOUT followed
    # by a newline.  Exits with status 1 on usage errors, an unreadable
    # input file or an unsupported type.
    #
    # Arguments: the command-line argument list.
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # $timeout is deliberately NOT declared with "my".  doc_to_html,
    # rtf_to_html and pdf_to_html read the package variable $timeout when
    # they build their "ulimit -t" prefix; a lexical declared here would
    # never reach them and the -timeout option would be silently ignored.
    $timeout = 0;

    # read command-line arguments
    # NOTE(review): the -type pattern only lists doc and pdf although rtf
    # and ps are dispatched below -- confirm how parsargv treats other
    # values before widening it.
    if (!parsargv::parse(\@ARGV,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must remain.
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames: the output files share the input's directory and
    # base name and differ only in their extension.
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: use the suffix without its leading dot.
    if ($input_type eq "")
    {
        $input_type = substr($suffix, 1, length($suffix) - 1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        print STDERR "I recognise this to be a Word document...\n"; # remove
        print convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";

}

main(@ARGV);
131
132
133
134	# Document-type conversion functions
135	#
136	# The following functions attempt to convert documents from their
137	# input type to the specified output type. If no output type was
138	# given, then they first attempt HTML, and then TEXT.
139	#
140	# Each returns the output type ("html" or "text") or "fail" if no
141	# conversion is possible.
142
143	# Convert a Microsoft word document
144
145	sub convertDOC {
146	($input_filename, $output_filestem, $output_type) = @_;
147
148	# Many .doc files are not in fact word documents!
149	my $realtype = &find_docfile_type($input_filename);
150
151	print STDERR "The real type of this Word document is $realtype\n"; # remove
152
153	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
154	print STDERR "I recognise this to be a word678 document...\n"; # remove
155	return &convertWord678($input_filename, $output_filestem, $output_type);
156	} elsif ($realtype eq "rtf") {
157	return &convertRTF($input_filename, $output_filestem, $output_type);
158	} else {
159	return &convertAnything($input_filename, $output_filestem, $output_type);
160	}
161	}
162
163	# Convert a Microsoft word 6/7/8 document
164
sub convertWord678 {
    # Convert a Microsoft Word 6/7/8 document.
    #
    # Arguments: input file name, output file stem, requested output type.
    # Returns: "html" when doc_to_html succeeds, otherwise whatever
    # convertAnything returns ("html", "text" or "fail").
    #
    # Arguments are lexicals so that this call does not overwrite package
    # globals of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML, unless text was asked for.
    if (!$output_type || ($output_type =~ /html/i)) {
        print STDERR "I am about to call doc_to_html...\n";
        if (doc_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Fall back to the generic extraction.
    return convertAnything($input_filename, $output_filestem, $output_type);
}
181
182
183	# Convert a Rich Text Format (RTF) file
184
sub convertRTF {
    # Convert a Rich Text Format (RTF) file.
    #
    # Arguments: input file name, output file stem, requested output type.
    # Returns: "html" when rtf_to_html succeeds, otherwise whatever
    # convertAnything returns ("html", "text" or "fail").
    #
    # Arguments are lexicals so that this call does not overwrite package
    # globals of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML, unless text was asked for.
    if (!$output_type || ($output_type =~ /html/i)) {
        if (rtf_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Fall back to the generic extraction.
    return convertAnything($input_filename, $output_filestem, $output_type);
}
200
201
202	# Convert an unidentified file
203
sub convertAnything {
    # Convert an unidentified file using the crude built-in extractors.
    #
    # Arguments: input file name, output file stem, requested output type.
    # Returns: "html" if any_to_html succeeded, "text" if any_to_text
    # succeeded, "fail" otherwise.  With no requested type HTML is tried
    # first and text second.
    #
    # Arguments are lexicals so that this call does not overwrite package
    # globals of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $want_html = (!$output_type || ($output_type =~ /html/i));
    my $want_text = (!$output_type || ($output_type =~ /text/i));

    # Attempt simple conversion to HTML
    if ($want_html && any_to_html($input_filename, $output_filestem)) {
        return "html";
    }

    # Convert to text
    if ($want_text && any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
226
227
228
229	# Convert an Adobe PDF document
230
sub convertPDF {
    # Convert an Adobe PDF document.
    #
    # Arguments: directory of the input file, input file name, output
    # file stem, requested output type.
    # Returns: "html", "text" or "fail".  With no requested type HTML is
    # tried first and text second.
    #
    # Arguments are lexicals so that this call does not overwrite package
    # globals of the same name.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT.  pdf_to_text unpacks its arguments as
    # (dirname, input, stem), exactly like pdf_to_html, so the directory
    # has to be passed here too; the original call left it out, which
    # shifted every argument by one position.
    if (!$output_type || ($output_type =~ /text/i)) {
        if (pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";

}
255
256
257	# Convert an Adobe PostScript document
258
sub convertPS {
    # Convert an Adobe PostScript document.  Only text output is
    # available for PostScript.
    #
    # Arguments: input file name, output file stem, requested output type.
    # Returns: "text" when ps_to_text succeeds, "fail" otherwise (this
    # includes the case where a type other than text was requested).
    #
    # Arguments are lexicals so that this call does not overwrite package
    # globals of the same name.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";

}
275
276
277	# Find the real type of a .doc file
278	#
279	# We seem to have a lot of files with a .doc extension that are .rtf
280	# files or Word 5 files. This function attempts to tell the difference.
281
sub find_docfile_type {
    # Work out what a file with a .doc extension really contains.
    #
    # Argument: the file name.
    # Returns: "rtf" if the first line starts with "{\rtf", "word6",
    # "word7" or "word8" if any line contains the matching
    # "Word.Document.N" marker, and "unknown" otherwise (including when
    # the file cannot be opened).
    #
    # The earlier version also copied every line into a "temp.txt" file
    # in the current directory and never closed or removed it; that
    # debugging dump is gone.  The input handle is now closed on every
    # path, including the "unknown" one.
    my ($input_filename) = @_;

    # Three-argument open: the file name is never interpreted as a mode
    # or a pipe, whatever characters it contains.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        # An RTF file announces itself at the very start of the file.
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # Is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }

        $first = 0;
    }

    close($chk);
    return $type;
}
316
317
318
319	# Specific type-to-type conversions
320	#
321	# Each of the following functions attempts to convert a document from
322	# a specific format to another. If they succeed they return 1 and leave
323	# the output document(s) in the appropriate place; if they fail they
324	# return 0 and delete any working files.
325
326
327	# Attempt to convert a word document to html with the wv program
328
sub doc_to_html {
    # Attempt to convert a Word document to HTML with the wvWare program.
    #
    # Arguments: input file name, output file stem.
    # Returns: 1 if "<stem>.html" was produced and its first line
    # contains "DOCTYPE HTML"; 0 otherwise (converter missing or output
    # unusable), in which case the .html and .err files are removed.
    #
    # Reads the package variable $timeout: when it is true the command
    # is prefixed with "ulimit -t $timeout;".
    #
    # The converter is run exactly once.  The earlier version called
    # system($cmd) inside a debugging print and then again in the test
    # that follows, so every document was converted twice.
    print STDERR "/;-DG I am in doc_to_html...\n"; # remove
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = "";
    my $wv_conf = "";
    # GSDLOS may be unset; treat that as "not windows" without a warning.
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i) {
        $wvWare = "$ENV{'GSDLHOME'}\\bin\\windows\\wvWare.exe";
        $wv_conf = "$ENV{'GSDLHOME'}\\bin\\windows\\wvHtml.xml";
    } else {
        my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
        $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
        $wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
    }
    print STDERR "I am about to test if your file exists...\n";
    return 0 unless (-e "$wvWare");

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    print STDERR "$cmd\n"; #remove

    # execute the command (once) and report its status
    my $status = system($cmd);
    print STDERR $status;
    print STDERR "\n";
    if ($status > 0)
    {
        print STDERR "Error executing wv converter: $!. Continuing...\n";
    }

    # Was the conversion successful?  The exit status is not trusted;
    # the first line of the output decides.
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $line = <$html>;
            close($html);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err");
            return 1;
        } else {
            # An error of some sort occurred
            &util::rm("$output_filestem.html");
            &util::rm("$output_filestem.err");
        }
    }
    print STDERR "/;-DG I am leaving doc_to_html...\n";
    return 0;
}
379
380
381	# Attempt to convert an RTF document to html with rtftohtml
382	#
383	# rtf2html isn't distributed with Greenstone because it is not
384	# distributed under the GPL. If you know of a better solution,
385	# please let me know.
386
sub rtf_to_html {
    # Attempt to convert an RTF document to HTML with rtf2html.
    #
    # Arguments: input file name, output file stem.
    # Returns: 1 if "<stem>.html" was produced and its first line
    # contains "DOCTYPE HTML"; 0 otherwise (converter not found or output
    # unusable), in which case the .html and .err files are removed.
    #
    # Reads the package variable $timeout: when it is true the command
    # is prefixed with "ulimit -t $timeout;".
    my ($input_filename, $output_filestem) = @_;

    # formulate the command: prefer the copy under $GSDLHOME/packages ...
    my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                    "rtf2html", "rtf2html", "rtf2html");
    if (!-e "$r_cmd") {
        # ... otherwise fall back to an rtf2html found on the PATH.  The
        # earlier version tested -e "rtf2html", which looks in the current
        # directory rather than on the PATH, so this fallback was
        # effectively never taken.
        require File::Spec;
        my $on_path = 0;
        foreach my $dir (File::Spec->path()) {
            my $candidate = File::Spec->catfile($dir, "rtf2html");
            if (-f $candidate && -x $candidate) {
                $on_path = 1;
                last;
            }
        }
        return 0 unless $on_path;
        $r_cmd = "rtf2html";   # let the shell resolve it via PATH
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command
    if (system($cmd) > 0)
    {
        print STDERR "Error executing rtf converter: $!. Continuing...\n";
    }

    # Was the conversion successful?  The exit status is not trusted;
    # the first line of the output decides.
    if (-e "$output_filestem.html") {
        my $line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $line = <$html>;
            close($html);
        }
        if ($line && $line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err");
            return 1;
        } else {
            # An error of some sort occurred
            &util::rm("$output_filestem.html");
            &util::rm("$output_filestem.err");
        }
    }
    return 0;
}
422
423
424	# Convert a pdf file to html with the pdftohtml command
425
sub pdf_to_html {
    # Convert a PDF file to HTML with the pdftohtml command.
    #
    # Arguments: directory of the input file (not used here), input file
    # name, output file stem.
    # Returns: 1 if the converter exited cleanly and "<stem>.html"
    # exists; 0 otherwise.  When the converter produced no HTML its
    # standard-error log ("<stem>.err") is echoed to STDERR.
    #
    # Reads the package variable $timeout: when it is true the command
    # is prefixed with "ulimit -t $timeout;".
    #
    # Arguments and the command string are lexicals; the original kept
    # them in package globals.
    my ($dirname, $input_filename, $output_filestem) = @_;

    # formulate the command
    my $p_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "pdftohtml");
    my $pdftohtml = &util::filename_cat($p_home, "pdftohtml_0_20", "pdftohtml.bin");
    return 0 unless (-e "$pdftohtml");

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$pdftohtml -noframes";
    $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= " >\"$output_filestem.out\" 2>\"$output_filestem.err\"";

    if (system($cmd) > 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 0;
    }

    # The captured standard output is never needed once the run is over.
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # make sure the converter made something
    if (! -e "$output_filestem.html")
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftohtml:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
        }
        return 0;
    }

    return 1;
}
467
468
469	# Convert a PDF file to text with the pdftotext command
470
sub pdf_to_text {
    # Convert a PDF file to text with the pdftotext command.
    #
    # Arguments: (dirname, input file name, output file stem).  The
    # two-argument form (input file name, output file stem) is accepted
    # as well, because that is how this routine has historically been
    # called; with the old fixed three-way unpacking such a call put the
    # file name into $dirname and the stem into $input_filename.
    # Returns: 1 if pdftotext exited cleanly and "<stem>.text" exists,
    # 0 otherwise.
    #
    # Fixes relative to the earlier version:
    #  * pdftotext writes to a file, not to stdout, so the output file is
    #    passed as its second argument instead of being a redirection;
    #  * success is judged on "<stem>.text", not "<stem>.html";
    #  * the error-log path is quoted like the other paths.
    my ($dirname, $input_filename, $output_filestem);
    if (@_ == 2) {
        ($input_filename, $output_filestem) = @_;
    } else {
        ($dirname, $input_filename, $output_filestem) = @_;
    }

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    if (system($cmd) > 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # make sure the converter made something
    if (! -e "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") || die "$!";
            print STDERR "pdftotext:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
504
505
506	# Convert a PostScript document to text with ps2ascii
507
sub ps_to_text {
    # Convert a PostScript document to text.
    #
    # ps2ascii is tried first.  If that command fails (for instance
    # because it is not installed) the text is stripped out of the
    # PostScript by a crude pure-perl pass that keeps what is inside
    # parentheses.
    #
    # Arguments: input file name, output file stem.
    # Returns: 1 when "<stem>.text" has been written by either method,
    # 0 when the fallback could not open its input or output file.
    #
    # Arguments are lexicals (the original assigned them to package
    # globals) and the fallback no longer leaves its file handles open
    # or an empty "<stem>.text" behind when it has to give up.
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";
    if (system($cmd) > 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on code nicked from:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Attempting to strip text from postscript.\n";
        my $in_ok = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" unless $in_ok;
        my $out_ok = open(my $out, '>', "$output_filestem.text");
        warn "Couldn't write file: $!" unless $out_ok;
        if (!($in_ok && $out_ok)) {
            close($in) if $in_ok;
            close($out) if $out_ok;
            # do not leave a freshly created, empty output file around
            &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
            print STDERR "errors\n";
            return 0;
        }

        # The substitutions below all work on $_; keep the caller's copy.
        local $_;
        my $in_a_sentence = 0;
        while (<$in>) {
            if (/^[^\(\)]+$/ && !$in_a_sentence) {next ;} # no brackets in line
            # attempt to add whitespace between different lines...
            s/F.?\(/\( /g; # this might break up some other words though...
            ### remove all postscript control data
            if (!$in_a_sentence) {
                s/^[^\(\)]*?\(//;} # rm start of line up to first open bracket
            s/\\\(/\{/g;s/\\\)/\}/g ; # change quoted braces
            s/\)([^\(\)])*?\(//g ; # close bracket up to next open unquoted bracket
            if (s/\)[^\(\)]*?$//g) # last close bracket to end of line
            {$in_a_sentence=0;chomp;}
            if (s/\\$//) # if line is a continuation
            {$in_a_sentence=1;chomp;}
            s/^$//g ; # remove empty lines
            ### ligatures have special characters...
            s/\\214/fi/g;
            s/\\215/fl/g;
            print $out "$_";
        }
        close($in);
        close($out);
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
556
557
558	# Convert any file to HTML with a crude perl implementation of the
559	# UNIX strings command.
560
sub any_to_html {
    # Convert any file to HTML: extract the printable strings with
    # any_to_text, then wrap every line of that text in a <p> tag.
    #
    # Arguments: input file name, output file stem.
    # Returns: 1 when "<stem>.html" has been written, 0 when the text
    # extraction failed or one of the files could not be opened.  The
    # intermediate "<stem>.text" file is removed in either case once it
    # has been created.
    #
    # Both files are now checked on open and closed before the
    # intermediate file is deleted; the original left both handles open.
    print STDERR "/;-Dg I am in any_to_html!\n";
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    my $opened = open($text, '<', "$output_filestem.text");
    if ($opened && !open($html, '>', "$output_filestem.html")) {
        close($text);
        $opened = 0;
    }
    if (!$opened) {
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
    print $html "\n\n";

    while (my $text_line = <$text>) {
        print $html "<p> ", $text_line;
    }
    print $html "\n</body></html>\n";

    close($text);
    close($html);

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    print STDERR "/;-Dg I am getting out of any_to_html!\n";
    return 1;
}
588
589	# Convert any file to TEXT with a crude perl implementation of the
590	# UNIX strings command.
591
sub any_to_text {
    # Convert any file to text with a crude perl implementation of the
    # UNIX strings command: runs of printable ASCII (octal 040-176) at
    # least 10 characters long are kept, one per line, and everything
    # else is dropped.
    #
    # Arguments: input file name, output file stem.
    # Returns: 1 once "<stem>.text" has been written and closed, 0 if
    # the input or output file could not be opened or the output could
    # not be flushed.
    #
    # The output handle is closed before returning.  The original never
    # closed it, so a caller that read "<stem>.text" straight away (as
    # any_to_html does) could see an empty or truncated file because the
    # data was still sitting in perl's output buffer.
    my ($input_filename, $output_filestem) = @_;

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    while (my $line = <$in>) {
        # turn every run of non-printable characters into a line break
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    close($in);
    # Buffered write errors only surface at close time.
    close($out) or return 0;
    return 1;
}
627
628
629

Note: See TracBrowser for help on using the repository browser.

Download in other formats: