Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 2977

Last change on this file since 2977 was 2977, checked in by jrm21, 22 years ago
added infrastructure for calling an external powerpoint to html converter.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 25.4 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
35	# using specialised conversion utilities. We can try to convert any file to
36	# text with a perl implementation of the UNIX strings command.
37	#
38	# We try to convert Postscript files to text using "gs" which is often on
39	# *nix machines. We fall back to performing weak text extraction by using
40	# regular expressions.
41
42	BEGIN {
43	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
44	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
45	}
46
47	use parsargv;
48	use util;
49	use Cwd;
50	use File::Basename;
51
52	# Are we running on WinNT or Win2000 (or later)?
53	my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
54	if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
55
56	sub print_usage
57	{
58	print STDERR "\n";
59	print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
60	print STDERR " or text using third-party programs.\n\n";
61	print STDERR " usage: $0 [options] filename\n";
62	print STDERR " options:\n\t-type\tdoc\|pdf\|ps\|ppt\|rtf\t(input file type)\n";
63	print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
64	print STDERR "\t-output\thtml\|text\n";
65	print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
66	exit(1);
67	}
68
69	my $faillogfile="";
70
71	sub main
72	{
73	my (@ARGV) = @_;
74	my ($input_type,$output_type,$verbose,$timeout);
75
76	$timeout = 0;
77	# read command-line arguments
78	if (!parsargv::parse(\@ARGV,
79	'type/(doc\|pdf\|ps\|ppt\|rtf)/', \$input_type,
80	'/errlog/.*/', \$faillogfile,
81	'output/(html\|text)/', \$output_type,
82	'timeout/\d+/0',\$timeout,
83	'verbose/\d+/0', \$verbose))
84	{
85	print_usage();
86	}
87
88	# Make sure the input file exists and can be opened for reading
89	if (scalar(@ARGV!=1)) {
90	print_usage();
91	}
92
93	my $input_filename = $ARGV[0];
94	if (!-r $input_filename) {
95	print STDERR "Error: unable to open $input_filename for reading\n";
96	exit(1);
97	}
98
99	# Deduce filenames
100	my ($tailname,$dirname,$suffix)
101	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
102	my $output_filestem = &util::filename_cat($dirname, "$tailname");
103
104	if ($input_type eq "")
105	{
106	$input_type = lc (substr($suffix,1,length($suffix)-1));
107	}
108
109	# Change to temporary working directory
110	my $stored_dir = cwd();
111	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
112
113	# Select convert utility
114	if (!defined $input_type) {
115	print STDERR "Error: No filename extension or input type defined\n";
116	exit(1);
117	}
118	elsif ($input_type eq "doc") {
119	print &convertDOC($input_filename, $output_filestem, $output_type);
120	print "\n";
121	}
122	elsif ($input_type eq "rtf") {
123	print &convertRTF($input_filename, $output_filestem, $output_type);
124	print "\n";
125	}
126	elsif ($input_type eq "pdf") {
127	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
128	print "\n";
129	}
130	elsif ($input_type eq "ps") {
131	print &convertPS($input_filename, $output_filestem, $output_type);
132	print "\n";
133	}
134	elsif ($input_type eq "ppt") {
135	print &convertPPT($input_filename, $output_filestem, $output_type);
136	print "\n";
137	}
138	else {
139	print STDERR "Error: Unable to convert type '$input_type'\n";
140	exit(1);
141	}
142
143	# restore to original working directory
144	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
145
146	}
147
148	&main(@ARGV);
149
150
151
152	# Document-type conversion functions
153	#
154	# The following functions attempt to convert documents from their
155	# input type to the specified output type. If no output type was
156	# given, then they first attempt HTML, and then TEXT.
157	#
158	# Each returns the output type ("html" or "text") or "fail" if no
159	# conversion is possible.
160
161	# Convert a Microsoft word document
162
163	sub convertDOC {
164	($input_filename, $output_filestem, $output_type) = @_;
165
166	# Many .doc files are not in fact word documents!
167	my $realtype = &find_docfile_type($input_filename);
168
169	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
170	return &convertWord678($input_filename, $output_filestem, $output_type);
171	} elsif ($realtype eq "rtf") {
172	return &convertRTF($input_filename, $output_filestem, $output_type);
173	} else {
174	return &convertAnything($input_filename, $output_filestem, $output_type);
175	}
176	}
177
178	# Convert a Microsoft word 6/7/8 document
179
180	sub convertWord678 {
181	($input_filename, $output_filestem, $output_type) = @_;
182
183	my $success = 0;
184
185	# Attempt specialised conversion to HTML
186	if (!$output_type \|\| ($output_type =~ /html/i)) {
187	$success = &doc_to_html($input_filename, $output_filestem);
188	if ($success) {
189	return "html";
190	}
191	}
192
193	return &convertAnything($input_filename, $output_filestem, $output_type);
194	}
195
196
197	# Convert a Rich Text Format (RTF) file
198
199	sub convertRTF {
200	($input_filename, $output_filestem, $output_type) = @_;
201
202	my $success = 0;
203
204	# Attempt specialised conversion to HTML
205	if (!$output_type \|\| ($output_type =~ /html/i)) {
206	$success = &rtf_to_html($input_filename, $output_filestem);
207	if ($success) {
208	return "html";
209	}
210	}
211
212	# rtf is so ugly that's it's not worth running strings over.
213	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
214	# return &convertAnything($input_filename, $output_filestem, $output_type);
215	return "fail";
216	}
217
218
219	# Convert an unidentified file
220
221	sub convertAnything {
222	($input_filename, $output_filestem, $output_type) = @_;
223
224	my $success = 0;
225
226	# Attempt simple conversion to HTML
227	if (!$output_type \|\| ($output_type =~ /html/i)) {
228	$success = &any_to_html($input_filename, $output_filestem);
229	if ($success) {
230	return "html";
231	}
232	}
233
234	# Convert to text
235	if (!$output_type \|\| ($output_type =~ /text/i)) {
236	$success = &any_to_text($input_filename, $output_filestem);
237	if ($success) {
238	return "text";
239	}
240	}
241	return "fail";
242	}
243
244
245
246	# Convert an Adobe PDF document
247
248	sub convertPDF {
249	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
250
251	my $success = 0;
252
253	# Attempt conversion to HTML
254	if (!$output_type \|\| ($output_type =~ /html/i)) {
255	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
256	if ($success) {
257	return "html";
258	}
259	}
260
261	# Attempt conversion to TEXT
262	if (!$output_type \|\| ($output_type =~ /text/i)) {
263	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
264	if ($success) {
265	return "text";
266	}
267	}
268
269	return "fail";
270
271	}
272
273
274	# Convert an Adobe PostScript document
275
276	sub convertPS {
277	($input_filename, $output_filestem, $output_type) = @_;
278
279	my $success = 0;
280
281	# Attempt conversion to TEXT
282	if (!$output_type \|\| ($output_type =~ /text/i)) {
283	$success = &ps_to_text($input_filename, $output_filestem);
284	if ($success) {
285	return "text";
286	}
287	}
288
289	return "fail";
290
291	}
292
293
294	sub convertPPT {
295	my ($input_filename, $output_filestem, $output_type) = @_;
296
297	my $success = 0;
298
299	# Attempt conversion to HTML
300	if (!$output_type \|\| ($output_type =~ /html/i)) {
301	# formulate the command
302	$cmd = "";
303	$cmd .= "perl -S ppttohtml.pl ";
304	$cmd .= " \"$input_filename\" \"$output_filestem.html\"";
305	$cmd .= " 2>\"$output_filestem.err\""
306	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
307
308
309	# execute the command
310	$!=0;
311	if (system($cmd)!=0)
312	{
313	print STDERR "Powerpoint converter failed $!\n";
314	} else {
315	return "html";
316	}
317	}
318
319	$success = &any_to_text($input_filename, $output_filestem);
320	if ($success) {
321	return "text";
322	}
323
324	return "fail";
325	}
326
327
328
329
330
331	# Find the real type of a .doc file
332	#
333	# We seem to have a lot of files with a .doc extension that are .rtf
334	# files or Word 5 files. This function attempts to tell the difference.
335
336	sub find_docfile_type {
337	($input_filename) = @_;
338
339	open(CHK, "<$input_filename");
340	binmode(CHK);
341	my $line = "";
342	my $first = 1;
343
344	while (<CHK>) {
345
346	$line = $_;
347
348	if ($first) {
349	# check to see if this is an rtf file
350	if ($line =~ /^\{\\rtf/) {
351	close(CHK);
352	return "rtf";
353	}
354	$first = 0;
355	}
356
357	# is this is a word 6/7/8 document?
358	if ($line =~ /Word\.Document\.([678])/) {
359	close(CHK);
360	return "word$1";
361	}
362
363	}
364
365	return "unknown";
366	}
367
368
369
370	# Specific type-to-type conversions
371	#
372	# Each of the following functions attempts to convert a document from
373	# a specific format to another. If they succeed they return 1 and leave
374	# the output document(s) in the appropriate place; if they fail they
375	# return 0 and delete any working files.
376
377
378	# Attempt to convert a word document to html with the wv program
379
380	sub doc_to_html {
381	($input_filename, $output_filestem) = @_;
382
383	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
384	$ENV{'GSDLOS'}, "wvWare");
385
386	# don't include path on windows (to avoid having to play about
387	# with quoting when GSDLHOME might contain spaces) but assume
388	# that the PATH is set up correctly
389	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
390
391	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
392	"packages", "wv", "wvHtml.xml");
393
394	my $cmd = "";
395	if ($timeout) {$cmd = "ulimit -t $timeout;";}
396	$cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
397	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
398
399	# redirecting STDERR is a bad idea on windows 95/98
400	$cmd .= " 2> \"$output_filestem.err\""
401	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
402
403	# execute the command
404	$!=0;
405	if (system($cmd)!=0)
406	{
407	print STDERR "Error executing wv converter:$!\n";
408	if (-s "$output_filestem.err") {
409	open (ERRFILE, "<$output_filestem.err");
410
411	my $write_to_fail_log=0;
412	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
413	{$write_to_fail_log=1;}
414
415	my $line;
416	while ($line=<ERRFILE>) {
417	if ($line =~ /\w/) {
418	print STDERR "$line";
419	print FAILLOG "$line" if ($write_to_fail_log);
420	}
421	if ($line !~ m/startup error/) {next;}
422	print STDERR " (given an invalid .DOC file?)\n";
423	print FAILLOG " (given an invalid .DOC file?)\n"
424	if ($write_to_fail_log);
425
426	} # while ERRFILE
427	close FAILLOG if ($write_to_fail_log);
428	}
429	print STDERR "Continuing...\n";
430	return 0; # we can try any_to_text
431	}
432
433	# Was the conversion successful?
434
435	if (-s "$output_filestem.html") {
436	open(TMP, "$output_filestem.html");
437	$line = <TMP>;
438	close(TMP);
439	if ($line && $line =~ /DOCTYPE HTML/) {
440	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
441	return 1;
442	}
443	}
444
445	# If here, an error of some sort occurred
446	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
447	if (-e "$output_filestem.err") {
448	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
449	open (ERRLOG,"$output_filestem.err");
450	while (<ERRLOG>) {print FAILLOG $_;}
451	close FAILLOG;
452	close ERRLOG;
453	}
454	&util::rm("$output_filestem.err");
455	}
456
457	return 0;
458	}
459
460
461	# Attempt to convert an RTF document to html with rtftohtml
462
463	sub rtf_to_html {
464	my ($input_filename, $output_filestem) = @_;
465
466	# formulate the command
467	$cmd = "";
468	if ($timeout) {$cmd = "ulimit -t $timeout;";}
469	$cmd .= "rtftohtml";
470
471	# it automatically uses $output_filestem.html
472	$cmd .= " \"$input_filename\"";
473
474	$cmd .= " 2>\"$output_filestem.err\""
475	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
476
477
478	# execute the command
479	$!=0;
480	if (system($cmd)!=0)
481	{
482	print STDERR "Error executing rtf converter $!\n";
483	# don't currently bother printing out error log...
484	# keep going, in case it still created an HTML file...
485	}
486
487	# Was the conversion successful?
488	my $was_successful=0;
489	if (-s "$output_filestem.html") {
490	# make sure we have some content other than header
491	open (HTML, "$output_filestem.html"); # what to do if fail?
492	my $line;
493	my $past_header=0;
494	while ($line=<HTML>) {
495
496	if ($past_header == 0) {
497	if ($line =~ /<body>/) {$past_header=1;}
498	next;
499	}
500
501	$line =~ s/<[^>]+>//g;
502	if ($line =~ /\w/ && $past_header) { # we found some content...
503	$was_successful=1;
504	last;
505	}
506	}
507	close HTML;
508	}
509
510	if ($was_successful) {
511	&util::rm("$output_filestem.err")
512	if (-e "$output_filestem.err");
513	# insert the (modified) table of contents, if it exists.
514	if (-e "${output_filestem}_ToC.html") {
515	&util::mv("$output_filestem.html","$output_filestem.src");
516	my $open_failed=0;
517	open HTMLSRC, "$output_filestem.src" \|\| ++$open_failed;
518	open TOC, "${output_filestem}_ToC.html" \|\| ++$open_failed;
519	open HTML, ">$output_filestem.html" \|\| ++$open_failed;
520
521	if ($open_failed) {
522	close HTMLSRC;
523	close TOC;
524	close HTML;
525	&util::mv("$output_filestem.src","$output_filestem.html");
526	return 1;
527	}
528
529	# print out header info from src html.
530	while (($_ = <HTMLSRC>) =~ /\w/) {
531	print HTML "$_";
532	}
533
534	# print out table of contents, making links relative
535	<TOC>; <TOC>; # ignore first 2 lines
536	print HTML scalar(<TOC>); # line 3 = "<ol>\n"
537	my $line;
538	while ($line=<TOC>) {
539	$line =~ s@</body></html>$@@ ; # only last line has this
540	# make link relative
541	$line =~ s@href=\"[^\#]+@href=\"@;
542	print HTML $line;
543	}
544	close TOC;
545
546	# rest of html src
547	while (<HTMLSRC>) {
548	print HTML $_;
549	}
550	close HTMLSRC;
551	close HTML;
552
553	&util::rm("${output_filestem}_ToC.html");
554	&util::rm("${output_filestem}.src");
555	}
556	# we don't yet do anything with footnotes ($output_filestem_fn.html) :(
557	return 1; # success
558	}
559
560	if (-e "$output_filestem.err") {
561	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
562	{
563	print FAILLOG "Error - rtftohtml - couldn't extract text\n";
564	print FAILLOG " (rtf file might be too recent):\n";
565	open (ERRLOG, "$output_filestem.err");
566	while (<ERRLOG>) {print FAILLOG $_;}
567	close ERRLOG;
568	close FAILLOG;
569	}
570	&util::rm("$output_filestem.err");
571	}
572
573	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
574
575	return 0;
576	}
577
578
579	# Convert a pdf file to html with the pdftohtml command
580
581	sub pdf_to_html {
582	my ($dirname, $input_filename, $output_filestem) = @_;
583
584	$cmd = "";
585	if ($timeout) {$cmd = "ulimit -t $timeout;";}
586	$cmd .= "perl -S pdftohtml.pl ";
587	$cmd .= " \"$input_filename\" \"$output_filestem\"";
588
589	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000) {
590	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
591	} else {
592	$cmd .= " > \"$output_filestem.err\"";
593	}
594
595	$!=0;
596
597	my $retval=system($cmd);
598	if ($retval!=0)
599	{
600	print STDERR "Error executing pdftohtml.pl";
601	if ($!) {print STDERR ": $!";}
602	print STDERR "\n";
603	}
604
605	# make sure the converter made something
606	if ($retval!=0 \|\| ! -s "$output_filestem.html")
607	{
608	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
609	# print out the converter's std err, if any
610	if (-s "$output_filestem.err") {
611	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
612	print STDERR "pdftohtml error log:\n";
613	while (<ERRLOG>) {
614	print STDERR "$_";
615	}
616	close ERRLOG;
617	}
618	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
619	if (-e "$output_filestem.err") {
620	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
621	{
622	open (ERRLOG, "$output_filestem.err");
623	while (<ERRLOG>) {print FAILLOG $_;}
624	close ERRLOG;
625	close FAILLOG;
626	}
627	&util::rm("$output_filestem.err");
628	}
629	return 0;
630	}
631
632	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
633	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
634	return 1;
635	}
636
637	# Convert a PDF file to text with the pdftotext command
638
639	sub pdf_to_text {
640	my ($dirname, $input_filename, $output_filestem) = @_;
641
642	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
643
644	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
645	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
646	} else {
647	$cmd .= " > \"$output_filestem.err\"";
648	}
649
650	if (system($cmd)!=0)
651	{
652	print STDERR "Error executing $cmd: $!\n";
653	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
654	}
655
656	# make sure there is some extracted text.
657	if (-e "$output_filestem.text") {
658	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
659	binmode(EXTR_TEXT); # just in case...
660	my $line="";
661	my $seen_text=0;
662	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
663	if ($line=~ /\w/) {$seen_text=1;}
664	}
665	close EXTR_TEXT;
666	if ($seen_text==0) { # no text was extracted
667	print STDERR "Error: pdftotext found no text\n";
668	&util::rm("$output_filestem.text");
669	}
670	}
671
672	# make sure the converter made something
673	if (! -s "$output_filestem.text")
674	{
675	# print out the converters std err, if any
676	if (-s "$output_filestem.err") {
677	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
678	print STDERR "pdftotext error log:\n";
679	while (<ERRLOG>) {
680	print STDERR "$_";
681	}
682	close ERRLOG;
683	}
684	# does this converter create a .out file?
685	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
686	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
687	if (-e "$output_filestem.err") {
688	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
689	{
690	open (ERRLOG,"$output_filestem.err");
691	while (<ERRLOG>) {print FAILLOG $_;}
692	close ERRLOG;
693	close FAILLOG;
694	}
695	&util::rm("$output_filestem.err");
696	}
697	return 0;
698	}
699	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
700	return 1;
701	}
702
703	# Convert a PostScript document to text
704	# note - just using "ps2ascii" isn't good enough, as it
705	# returns 0 for a postscript interpreter error. ps2ascii is just
706	# a wrapper to "gs" anyway, so we use that cmd here.
707
708	sub ps_to_text {
709	my ($input_filename, $output_filestem) = @_;
710
711	my $error = "";
712
713	# if we're on windows we'll fall straight through without attempting
714	# to use gs
715	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
716	$error = "Windows does not support gs";
717
718	} else {
719	my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
720	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
721	$cmd .= " 2> $output_filestem.err";
722	$!=0;
723
724	my $retcode=system($cmd);
725	$retcode = $? >> 8; # see man perlfunc - system for this...
726	# if system returns -1 \| 127 (couldn't start program), look at $! for message
727
728	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
729	elsif (! -e "$output_filestem.text") {
730	$error="did not create output file.\n";
731	}
732	else
733	{ # make sure the interpreter didn't get an error. It is technically
734	# possible for the actual text to start with this, but....
735	open PSOUT, "$output_filestem.text";
736	if (<PSOUT> =~ /^Error: (.*)/) {
737	$error="interpreter error - \"$1\"";
738	}
739	close PSOUT;
740	}
741	}
742
743	if ($error ne "")
744	{
745	print STDERR "Warning: Error executing gs: $error\n";
746	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
747
748	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
749	{
750	print FAILLOG "gs - $error\n";
751	if (-e "$output_filestem.err") {
752	open(ERRLOG, "$output_filestem.err");
753	while (<ERRLOG>) {print FAILLOG $_;}
754	close ERRLOG;
755	}
756	close FAILLOG;
757	}
758	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
759
760
761	# Fine then. We'll just do a lousy job by ourselves...
762	# Based on 5-line regexp sed script found at:
763	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
764	#
765	print STDERR "Stripping text from postscript\n";
766	my $errorcode=0;
767	open (IN, "$input_filename")
768	\|\| ($errorcode=1, warn "Couldn't read file: $!");
769	open (OUT, ">$output_filestem.text")
770	\|\| ($errorcode=1, warn "Couldn't write file: $!");
771	if ($errorcode) {print STDERR "errors\n";return 0;}
772
773	my $text=""; # this is for whole .ps file...
774	$text = join('', <IN>); # see man perlport, under "System Resources"
775	close IN;
776
777	# Make sure this is a ps file...
778	if ($text !~ /^%!/) {
779	print STDERR "Bad postscript header: not '%!'\n";
780	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
781	{
782	print FAILLOG "Bad postscript header: not '%!'\n";
783	close FAILLOG;
784	}
785	return 0;
786	}
787
788	# if ps has Page data, then use it to delete all stuff before it.
789	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
790
791	# remove all leading non-data stuff
792	$text =~ s/^.*?\(//s;
793
794	# remove all newline chars for easier processing
795	$text =~ s/\n//g;
796
797	# Big assumption here - assume that if any co-ordinates are
798	# given, then we are at the end of a sentence.
799	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
800
801	# special characters--
802	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
803
804	# ? ps text formatting (eg italics?) ?
805	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
806	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
807	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
808	# default - remove the rest
809	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
810
811	# attempt to add whitespace between words...
812	# this is based purely on observation, and may be completely wrong...
813	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
814	# eg I notice "b(" is sometimes NOT a space if preceded by a
815	# negative number.
816	$text =~ s/\)\d+ ?b\(/\) \( /g;
817
818	# change quoted braces to brackets
819	$text =~ s/([^\\])\\\(/$1\{/g;
820	$text =~ s/([^\\])\\\)/$1\}/g ;
821
822	# remove everything that is not between braces
823	$text =~ s/\)([^\(\)])+?\(//sg ;
824
825	# remove any Trailer eof stuff.
826	$text =~ s/\)[^\)]*$//sg;
827
828	### ligatures have special characters...
829	$text =~ s/\\013/ff/g;
830	$text =~ s/\\014/fi/g;
831	$text =~ s/\\015/fl/g;
832	$text =~ s/\\016/ffi/g;
833	$text =~ s/\\214/fi/g;
834	$text =~ s/\\215/fl/g;
835	$text =~ s/\\017/\n\* /g; # asterisk?
836	$text =~ s/\\023/\023/g; # e acute ('e)
837	$text =~ s/\\177/\252/g; # u"
838	# $text =~ s/ ?? /\344/g; # a"
839
840	print OUT "$text";
841	close OUT;
842	}
843	# wrap the text - use a minimum length. ie, first space after this length.
844	my $wrap_length=72;
845	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
846	open INFILE, "$output_filestem.text.tmp" \|\|
847	die "Couldn't open file: $!";
848	open OUTFILE, ">$output_filestem.text" \|\|
849	die "Couldn't open file for writing: $!";
850	my $line="";
851	while ($line=<INFILE>) {
852	while (length($line)>0) {
853	if (length($line)>$wrap_length) {
854	$line =~ s/^(.{$wrap_length}[^\s])\s//;
855	print OUTFILE "$1\n";
856	} else {
857	print OUTFILE "$line";
858	$line="";
859	}
860	}
861	}
862	close INFILE;
863	close OUTFILE;
864	&util::rm("$output_filestem.text.tmp");
865
866	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
867	return 1;
868	}
869
870
871	# Convert any file to HTML with a crude perl implementation of the
872	# UNIX strings command.
873
874	sub any_to_html {
875	($input_filename, $output_filestem) = @_;
876
877	# First generate a text file
878	return 0 unless (&any_to_text($input_filename, $output_filestem));
879
880	# create an HTML file from the text file
881	open(TEXT, "<$output_filestem.text");
882	open(HTML, ">$output_filestem.html");
883
884	print HTML "<html><head>\n";
885	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
886	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
887	print HTML "</head><body>\n\n";
888
889	my $line;
890	while ($line=<TEXT>) {
891	$line =~ s/</</g;
892	$line =~ s/>/>/g;
893	if ($line =~ /^\s*$/) {
894	print HTML "<p>";
895	} else {
896	print HTML "<br> ", $line;
897	}
898	}
899	print HTML "\n</body></html>\n";
900
901	close HTML;
902	close TEXT;
903
904	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
905	return 1;
906	}
907
908	# Convert any file to TEXT with a crude perl implementation of the
909	# UNIX strings command.
910	# Note - this assumes ascii charsets :( (jrm21)
911
912	sub any_to_text {
913	($input_filename, $output_filestem) = @_;
914
915	open(IN, "<$input_filename") \|\| return 0;
916	binmode(IN);
917	open(OUT, ">$output_filestem.text") \|\| return 0;
918
919	my ($line);
920	my $output_line_count = 0;
921	while (<IN>) {
922	$line = $_;
923
924	# delete anything that isn't a printable character
925	$line =~ s/[^\040-\176]+/\n/sg;
926
927	# delete any string less than 10 characters long
928	$line =~ s/^.{0,9}$/\n/mg;
929	while ($line =~ /^.{1,9}$/m) {
930	$line =~ s/^.{0,9}$/\n/mg;
931	$line =~ s/\n+/\n/sg;
932	}
933
934	# remove extraneous whitespace
935	$line =~ s/\n+/\n/gs;
936	$line =~ s/^\n//gs;
937
938	# output whatever is left
939	if ($line =~ /[^\n ]/) {
940	print OUT $line;
941	++$output_line_count;
942	}
943	}
944
945	close OUT;
946	close IN;
947
948	if ($output_line_count) { # try to protect against binary only formats
949	return 1;
950	}
951
952	&util::rm("$output_filestem.text");
953	return 0;
954
955	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: