Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 2031

Last change on this file since 2031 was 2031, checked in by jrm21, 23 years ago
Improved postscript to text handling a little bit better. Also, system($cmd) return value can be "-1", not just ">0"....
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 17.2 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. These are usually found in the
31	# $GSDLHOME/packages directory.
32	#
33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34	# conversion utilities. We can convert any file to text with a perl
35	# implementation of the UNIX strings command.
36
37
BEGIN {
    # The Greenstone perl modules (parsargv, util) live under
    # $GSDLHOME/perllib, so that directory must be on @INC before the
    # "use" lines below are compiled.
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
42
43	use parsargv;
44	use util;
45	use Cwd;
46	use File::Basename;
47
48
sub print_usage
{
    # Print the command-line help to STDERR and terminate with exit
    # status 1.  This sub never returns to its caller.
    my @help = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|pdf\n\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\n",
    );

    print STDERR @help;
    exit(1);
}
59
60
sub main
{
    # Entry point: parse the command line, work out the input type,
    # run the matching converter and print the resulting output type
    # ("html", "text" or "fail") followed by a newline on STDOUT.
    #
    # Arguments: the command-line argument list.
    # Exits with status 1 on usage errors, unreadable input or an
    # unsupported input type.
    my (@ARGV) = @_;
    my ($input_type, $output_type, $verbose);

    # $timeout is deliberately the package variable and NOT a "my"
    # lexical: doc_to_html, rtf_to_html and pdf_to_html read $timeout
    # to build their "ulimit -t" prefix, and a lexical declared here
    # would be invisible to them (the -timeout option would be ignored).
    $timeout = 0;

    # read command-line arguments
    if (!parsargv::parse(\@ARGV,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one filename must remain after option parsing
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below, so a relative name such
    # as "docs/a.pdf" would no longer resolve afterwards.  Anchor it to
    # the current directory first (absolute names are left untouched).
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension (without
    # its leading dot).  Left undefined when there is no extension.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? substr($suffix, 1) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
132
# Run the converter on the real command line.  main() is already
# defined above, so the plain call form is equivalent to &main(...).
main(@ARGV);
134
135
136
137	# Document-type conversion functions
138	#
139	# The following functions attempt to convert documents from their
140	# input type to the specified output type. If no output type was
141	# given, then they first attempt HTML, and then TEXT.
142	#
143	# Each returns the output type ("html" or "text") or "fail" if no
144	# conversion is possible.
145
146	# Convert a Microsoft word document
147
sub convertDOC {
    # Convert a file that carries a .doc extension.
    #
    # Arguments: input filename, output filestem (output path without
    # extension) and the requested output type ("html", "text", or
    # empty/undefined for "try html, then text").
    # Returns "html", "text" or "fail".
    #
    # The arguments are lexicals so that the helpers called below cannot
    # clobber them through same-named package variables.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic "strings"-style extraction
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
162
163	# Convert a Microsoft word 6/7/8 document
164
sub convertWord678 {
    # Convert a Word 6/7/8 document.
    #
    # Arguments: input filename, output filestem, requested output type
    # (empty/undefined means "no preference").
    # Returns "html", "text" or "fail".
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML, unless the caller asked
    # for something other than html.
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&doc_to_html($input_filename, $output_filestem));
    }

    # Specialised conversion not wanted or not possible: fall back to
    # the generic converter.
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
180
181
182	# Convert a Rich Text Format (RTF) file
183
sub convertRTF {
    # Convert a Rich Text Format (RTF) file.
    #
    # Arguments: input filename, output filestem, requested output type
    # (empty/undefined means "no preference").
    # Returns "html", "text" or "fail".
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML, unless the caller asked
    # for something other than html.
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&rtf_to_html($input_filename, $output_filestem));
    }

    # Specialised conversion not wanted or not possible: fall back to
    # the generic converter.
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
199
200
201	# Convert an unidentified file
202
sub convertAnything {
    # Convert a file of unknown format using the crude "strings"-style
    # extractors.
    #
    # Arguments: input filename, output filestem, requested output type
    # (empty/undefined means "try html first, then text").
    # Returns "html", "text" or "fail".
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
225
226
227
228	# Convert an Adobe PDF document
229
sub convertPDF {
    # Convert an Adobe PDF document.
    #
    # Arguments: directory of the input file, input filename, output
    # filestem, requested output type (empty/undefined means "try html
    # first, then text").
    # Returns "html", "text" or "fail".
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT.  pdf_to_text unpacks the same three
    # leading arguments as pdf_to_html, so $dirname must be passed too;
    # leaving it out shifts every argument by one position.
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text"
            if (&pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
254
255
256	# Convert an Adobe PostScript document
257
sub convertPS {
    # Convert an Adobe PostScript document.  Only conversion to text is
    # available, so an explicit request for html yields "fail".
    #
    # Arguments: input filename, output filestem, requested output type
    # (empty/undefined means "no preference").
    # Returns "text" or "fail".
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
274
275
276	# Find the real type of a .doc file
277	#
278	# We seem to have a lot of files with a .doc extension that are .rtf
279	# files or Word 5 files. This function attempts to tell the difference.
280
sub find_docfile_type {
    # Sniff the real format of a file that has a .doc extension.
    #
    # Argument: the filename to inspect.
    # Returns "rtf" if the first line starts with "{\rtf", "word6",
    # "word7" or "word8" if any line contains "Word.Document.<n>", and
    # "unknown" otherwise (including when the file cannot be opened).
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        # the rtf signature only counts at the very start of the file
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }

        $first = 0;
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
313
314
315
316	# Specific type-to-type conversions
317	#
318	# Each of the following functions attempts to convert a document from
319	# a specific format to another. If they succeed they return 1 and leave
320	# the output document(s) in the appropriate place; if they fail they
321	# return 0 and delete any working files.
322
323
324	# Attempt to convert a word document to html with the wv program
325
sub doc_to_html {
    # Attempt to convert a word document to html with the wvWare program.
    #
    # Arguments: input filename, output filestem.
    # Reads the package variable $timeout; when it is true the command
    # is prefixed with "ulimit -t $timeout".
    # Returns 1 and leaves "$output_filestem.html" in place on success;
    # returns 0 and removes the working files otherwise.
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");
    $wvWare .= ".exe" if ($ENV{'GSDLOS'} =~ /^windows$/i);
    return 0 unless (-e "$wvWare");

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "packages",
                                      "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config $wv_conf";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command.  system() returns -1 when the command could
    # not be started at all, so test for "not zero" rather than "> 0".
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter: $!. Continuing...\n";
    }

    # Was the conversion successful?  The output counts as good only if
    # its first line carries an HTML DOCTYPE.
    if (-e "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        &util::rm("$output_filestem.html");
    }

    # Failure: do not leave the captured stderr behind
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
365
366
367	# Attempt to convert an RTF document to html with rtftohtml
368	#
369	# rtf2html isn't distributed with Greenstone because it is not
370	# distributed under the GPL. If you know of a better solution,
371	# please let me know.
372
sub rtf_to_html {
    # Attempt to convert an RTF document to html with rtf2html.
    #
    # Arguments: input filename, output filestem.
    # Reads the package variable $timeout; when it is true the command
    # is prefixed with "ulimit -t $timeout".
    # Returns 1 and leaves "$output_filestem.html" in place on success;
    # returns 0 and removes the working files otherwise (including when
    # no rtf2html program can be found).
    my ($input_filename, $output_filestem) = @_;

    # formulate the command: prefer a copy under $GSDLHOME/packages,
    # otherwise fall back to an rtf2html found on the PATH.
    my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
                                    "rtf2html", "rtf2html", "rtf2html");
    if (!-e "$r_cmd") {
        # A plain -e "rtf2html" test would only look in the current
        # directory, so search the PATH directories explicitly.
        require File::Spec;
        my ($on_path) = grep { -f $_ && -x $_ }
                        map  { File::Spec->catfile($_, "rtf2html") }
                        File::Spec->path();
        return 0 unless (defined $on_path);
        $r_cmd = "rtf2html";
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$r_cmd";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";

    # execute the command.  system() returns -1 when the command could
    # not be started at all, so test for "not zero" rather than "> 0".
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter: $!. Continuing...\n";
    }

    # Was the conversion successful?  The output counts as good only if
    # its first line carries an HTML DOCTYPE.
    if (-e "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
            return 1;
        }
        # An error of some sort occurred
        &util::rm("$output_filestem.html");
    }

    # Failure: do not leave the captured stderr behind
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 0;
}
408
409
410	# Convert a pdf file to html with the pdftohtml command
411
sub pdf_to_html {
    # Convert a pdf file to html with the pdftohtml.pl command.
    #
    # Arguments: directory of the input file (unused here, kept for the
    # callers), input filename, output filestem.
    # Reads the package variable $timeout; when it is true the command
    # is prefixed with "ulimit -t $timeout".
    # Returns 1 if "$output_filestem.html" exists afterwards, else 0.
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "pdftohtml.pl -F ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        return 0;
    }

    # make sure the converter made something
    my $made_html = (-e "$output_filestem.html") ? 1 : 0;

    # The .out working file is never wanted, whatever the outcome
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converters std err, if any
    if (!$made_html && -e "$output_filestem.err") {
        open(my $errlog, '<', "$output_filestem.err") or die "$!";
        print STDERR "pdftohtml:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close($errlog);
    }

    return $made_html;
}
445
446	# Convert a PDF file to text with the pdftotext command
447
sub pdf_to_text {
    # Convert a PDF file to text with the pdftotext command.
    #
    # Arguments: (dirname, input filename, output filestem).  The
    # leading dirname is not used; because convertPDF has historically
    # called this with only (input filename, output filestem), a
    # two-argument call is accepted as well.
    # Returns 1 and leaves "$output_filestem.text" in place on success;
    # returns 0 and removes the working files otherwise.
    my @args = @_;
    shift(@args) if (scalar(@args) > 2);    # drop the unused dirname
    my ($input_filename, $output_filestem) = @args;

    # pdftotext writes to the file named by its second argument; it only
    # writes to stdout when that argument is "-", so redirecting stdout
    # would leave an empty .text file.
    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";

    # system() returns -1 when the command could not be started at all,
    # so test for "not zero" rather than "> 0".
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        return 0;
    }

    # make sure the converter made something
    if (! -e "$output_filestem.text")
    {
        # print out the converters std err, if any
        if (-e "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext:\n";
            while (my $err_line = <$errlog>) {
                print STDERR "$err_line";
            }
            close($errlog);
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
481
482	# Convert a PostScript document to text
483	# note - just using "ps2ascii" isn't good enough, as it
484	# returns 0 for a postscript interpreter error. ps2ascii is just
485	# a wrapper to "gs" anyway, so we use that cmd here.
486
sub ps_to_text {
    # Convert a PostScript document to text.
    #
    # First tries ghostscript ("gs" driving ps2ascii.ps).  If that
    # cannot be run, exits abnormally, produces no output file, or
    # reports an interpreter error on its first output line, falls back
    # to a crude regexp-based extraction of the strings between
    # parentheses in the PostScript source.
    #
    # Arguments: input filename, output filestem.
    # Returns 1 with "$output_filestem.text" written, or 0 if the
    # fallback could not read the input or create the output.
    my ($input_filename, $output_filestem) = @_;

    my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
    $cmd .= " 2> \"$output_filestem.err\"";
    $! = 0;

    # system() returns -1 if the program couldn't be started, otherwise
    # the wait status: exit code in the high byte, terminating signal in
    # the low bits.  Testing the whole status (not just status >> 8)
    # also catches a gs that was killed by a signal.
    my $status = system($cmd);

    my $error = "";
    if ($status != 0) {
        # if the program couldn't be started, look at $! for the message
        if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
    }
    elsif (! -e "$output_filestem.text") {
        $error = "did not create output file.\n";
    }
    else {
        # make sure the interpreter didn't get an error. It is technically
        # possible for the actual text to start with this, but....
        if (open(my $psout, '<', "$output_filestem.text")) {
            my $first_line = <$psout>;    # undef for an empty file
            close($psout);
            if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "PSPLUG: WARNING: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "PSPlug: Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $out;
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            close($in);
            print STDERR "errors\n";
            return 0;
        }

        # read the whole .ps file in one go
        my $text = do { local $/; <$in> };
        $text = "" unless (defined $text);
        close($in);

        # if ps has Page data, then use it to delete all stuff before it
        # (everything up to and including the first %%Page: line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        # NOTE(review): the next two replacement bytes look suspicious -
        # \023 is a control character rather than an e acute, and \252
        # is not u-umlaut in Latin-1.  Left unchanged; confirm the
        # intended output encoding before altering them.
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close($out);
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
593
594
595	# Convert any file to HTML with a crude perl implementation of the
596	# UNIX strings command.
597
sub any_to_html {
    # Convert any file to HTML: extract the printable strings with
    # any_to_text, then wrap each extracted line in a <p> inside a
    # minimal HTML page.
    #
    # Arguments: input filename, output filestem.
    # Returns 1 and leaves "$output_filestem.html" in place on success
    # (the intermediate .text file is removed); returns 0 if the text
    # extraction fails or either file cannot be opened.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text;
    if (!open($text, '<', "$output_filestem.text")) {
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }
    my $html;
    if (!open($html, '>', "$output_filestem.html")) {
        close($text);
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    # NOTE(review): the extracted text is written as-is; characters such
    # as "<" and "&" are not escaped.
    print $html '<html><head>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
</head><body>';
    print $html "\n\n";

    while (my $line = <$text>) {
        print $html "<p> ", $line;
    }
    print $html "\n</body></html>\n";

    # Close both handles before returning: this flushes the HTML to disk
    # and releases the text file before it is removed.
    close($text);
    close($html);

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
623
624	# Convert any file to TEXT with a crude perl implementation of the
625	# UNIX strings command.
626
sub any_to_text {
    # Convert any file to TEXT with a crude perl implementation of the
    # UNIX strings command: keep only runs of printable ASCII that are
    # at least 10 characters long, one per output line.
    #
    # Arguments: input filename, output filestem.
    # Returns 1 once "$output_filestem.text" has been written and
    # closed; returns 0 if the input cannot be read or the output
    # cannot be created.
    my ($input_filename, $output_filestem) = @_;

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    while (my $line = <$in>) {

        # turn every run of non-printable characters into a line break
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    # Close explicitly so the text is flushed to disk before the caller
    # (e.g. any_to_html) reads the file back.
    close($in);
    close($out);
    return 1;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: