Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 3246

Last change on this file since 3246 was 3246, checked in by jrm21, 22 years ago
RTF files that end in .doc were converted to $filestem.doc.html by default, so we force the output filename instead.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 26.3 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
BEGIN {
    # The project-local modules pulled in by the "use" lines below are
    # looked up in $GSDLHOME/perllib, so that directory must be on @INC
    # at compile time. Without GSDLHOME there is nothing sensible to do.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If Win32.pm cannot be loaded the eval yields undef, which is mapped to 0
# below, so the flag is always defined.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);
59
# Print the command-line synopsis to STDERR and terminate the script
# with exit status 1. Never returns.
sub print_usage
{
    my @usage_lines = (
        "",
        "gsConvert.pl: Converts documents in a range of formats to html",
        " or text using third-party programs.",
        "",
        " usage: $0 [options] filename",
        " options:",
        "\t-type\tdoc|pdf|ps|ppt|rtf|xls\t(input file type)",
        "\t-errlog\t<filename>\t(append err messages)",
        "\t-output\thtml|text",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line, "\n";
    }

    exit(1);
}
72
# File that converter error output is appended to (set by the errlog
# option); the empty string means "no fail log".
my $faillogfile="";

# Limit in CPU seconds for external converters (set by -timeout); 0 means
# no limit. It is declared at file scope, not inside main(), because the
# *_to_html subs further down read $timeout directly: a "my" inside main()
# would be invisible to them and the option would silently do nothing.
my $timeout = 0;

# Entry point: parse the command line, work out the input type, and
# dispatch to the matching convertXXX sub. The result of the conversion
# ("html", "text" or "fail") is printed on STDOUT followed by a newline.
# Exits with status 1 on usage errors, an unreadable input file, or an
# unsupported / undeterminable input type.
sub main
{
    my (@args) = @_;
    my ($input_type,$output_type,$verbose);

    # read command-line arguments
    # NOTE(review): the errlog spec has a leading '/', unlike the other
    # option specs - confirm against parsargv that this is intended.
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must be left
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit type: fall back to the lower-cased filename extension
    # (without its leading dot).
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : "";
    }
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "doc") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ppt") {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "xls") {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
157
158
159
160	# Document-type conversion functions
161	#
162	# The following functions attempt to convert documents from their
163	# input type to the specified output type. If no output type was
164	# given, then they first attempt HTML, and then TEXT.
165	#
166	# Each returns the output type ("html" or "text") or "fail" if no
167	# conversion is possible.
168
# Convert a Microsoft word document
#
# Arguments: input filename, output file stem (output path without
# extension), requested output type ("html", "text" or empty for "any").
# Returns whatever the converter it delegates to returns.

sub convertDOC {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    } elsif ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    } else {
        # not recognised: fall back to the generic string extraction
        return &convertAnything($input_filename, $output_filestem, $output_type);
    }
}
185
# Convert a Microsoft word 6/7/8 document
#
# Tries doc_to_html first when HTML is acceptable (no output type given,
# or one matching /html/i); otherwise, or if that fails, hands over to
# convertAnything. Returns "html", or convertAnything's result.

sub convertWord678 {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&doc_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
203
204
# Convert a Rich Text Format (RTF) file
#
# Returns "html" if rtf_to_html succeeds (only attempted when no output
# type is given or it matches /html/i), otherwise "fail". There is
# deliberately no text fallback.

sub convertRTF {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&rtf_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
225
226
# Convert an unidentified file
#
# Tries any_to_html when HTML is acceptable, then any_to_text when text
# is acceptable; an empty/undefined output type accepts both.
# Returns "html", "text" or "fail".

sub convertAnything {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&any_to_html($input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        if (&any_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
251
252
253
# Convert an Adobe PDF document
#
# HTML is tried first, then plain text; each is only attempted when no
# output type was requested or the requested type matches it.
# Returns "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ /html/i));
    my $text_wanted = (!$output_type || ($output_type =~ /text/i));

    # Attempt conversion to HTML
    return "html"
        if ($html_wanted
            && &pdf_to_html($dirname, $input_filename, $output_filestem));

    # Attempt conversion to TEXT
    return "text"
        if ($text_wanted
            && &pdf_to_text($dirname, $input_filename, $output_filestem));

    return "fail";
}
280
281
# Convert an Adobe PostScript document
#
# Only text output is available: ps_to_text is attempted when no output
# type is given or it matches /text/i. Returns "text" or "fail".

sub convertPS {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (&ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
300
301
# Convert a Microsoft PowerPoint document
#
# When HTML is acceptable, runs "perl -S ppttohtml.pl" on the file; if
# that is skipped or fails, falls back to any_to_text (regardless of the
# requested output type). Returns "html", "text" or "fail".

sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is lexical so it cannot leak into,
        # or pick up junk from, the other converters)
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected off Windows, or on NT/2000
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $!=0;
        if (system($cmd)!=0)
        {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
334
335
# Convert a Microsoft Excel document
#
# When HTML is acceptable, runs "perl -S xlstohtml.pl" on the file; if
# that is skipped or fails, falls back to any_to_text (regardless of the
# requested output type). Returns "html", "text" or "fail".

sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is lexical so it cannot leak into,
        # or pick up junk from, the other converters)
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected off Windows, or on NT/2000
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $!=0;
        if (system($cmd)!=0)
        {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
368
369
370
371
372
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns "rtf" if the first line starts with "{\rtf", "word6"/"word7"/
# "word8" if any line contains "Word.Document.<6|7|8>", and "unknown"
# otherwise (including when the file cannot be opened).

sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (defined(my $line = <$chk>)) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
409
410
411
412	# Specific type-to-type conversions
413	#
414	# Each of the following functions attempts to convert a document from
415	# a specific format to another. If they succeed they return 1 and leave
416	# the output document(s) in the appropriate place; if they fail they
417	# return 0 and delete any working files.
418
419
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare, writing "$output_filestem.html" (and, where stderr can be
# redirected, "$output_filestem.err"). Returns 1 if the command succeeded
# and the first line of the html output contains "DOCTYPE HTML";
# otherwise returns 0. Error output is echoed to STDERR and/or appended
# to $faillogfile when one is configured.

sub doc_to_html {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    # $timeout is not a parameter; it is whichever file/package level
    # $timeout is in scope here.
    # NOTE(review): confirm it is actually populated from -timeout.
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (defined(my $line = <$errfile>)) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # previously left open, leaking the handle
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        print STDERR "Continuing...\n";
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        # copy the converter's error output to the fail log, if there is one
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (defined(my $line = <$errlog>)) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
501
502
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml to produce "$output_filestem.html". The conversion counts
# as successful when that file has some word characters after its <body>
# line once tags are stripped. On success an "${output_filestem}_ToC.html"
# file, if present, is spliced into the html after the header and then
# removed. Returns 1 on success; on failure removes the working files,
# appends any error output to $faillogfile (if set) and returns 0.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    # $timeout is not a parameter; it is whichever file/package level
    # $timeout is in scope here.
    # NOTE(review): confirm it is actually populated from -timeout.
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header
        if (open(my $html, '<', "$output_filestem.html")) {
            my $past_header=0;
            while (defined(my $line = <$html>)) {
                if (!$past_header) {
                    # the <body> line itself is never treated as content
                    if ($line =~ /<body>/) {$past_header=1;}
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ /\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html);
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # Each open is checked individually. (Writing
            # 'open FH, "name" || ...' applies || to the filename string,
            # which is always true, so failures would go unnoticed.)
            my $src_ok = open(my $htmlsrc, '<', "$output_filestem.src");
            my $toc_ok = open(my $toc, '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $html, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the ToC and put the plain html back
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                close($html) if ($out_ok);
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to (not
            # including) the first line with no word characters.
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ /\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $toc_start = <$toc>; # line 3 = "<ol>\n"
            print {$html} $toc_start if (defined $toc_start);
            while (defined(my $toc_line = <$toc>)) {
                $toc_line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@;
                print {$html} $toc_line;
            }
            close($toc);

            # rest of html src
            while (defined(my $src_line = <$htmlsrc>)) {
                print {$html} $src_line;
            }
            close($htmlsrc);
            close($html);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes (${output_filestem}_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (defined(my $line = <$errlog>)) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
618
619
# Convert a pdf file to html with the pdftohtml command
#
# Runs "perl -S pdftohtml.pl <input> <output stem>". Returns 1 when the
# command exits with 0 and "$output_filestem.html" is non-empty; otherwise
# echoes the converter's error output to STDERR, appends it to
# $faillogfile (if set), removes the working files and returns 0.
# $dirname is accepted for call compatibility but not used here.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    # $timeout is not a parameter; it is whichever file/package level
    # $timeout is in scope here.
    # NOTE(review): confirm it is actually populated from -timeout.
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl ";
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no stderr redirection here: stdout goes to the .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;

    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval!=0 || ! -s "$output_filestem.html")
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            # Read the log once and send it to both destinations. Failing
            # to read a diagnostic log is not worth dying for: the caller
            # can still try other converters.
            my @err_lines = ();
            if (open(my $errlog, '<', "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close($errlog);
            } else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }

            # print out the converter's std err, if any
            if (@err_lines) {
                print STDERR "pdftohtml error log:\n";
                print STDERR @err_lines;
            }

            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print {$faillog} @err_lines;
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
677
# Convert a PDF file to text with the pdftotext command
#
# Runs pdftotext to produce "$output_filestem.text". Returns 1 if that
# file ends up non-empty and contains at least one word character;
# otherwise echoes the converter's error output to STDERR, appends it to
# $faillogfile (if set), removes the working files and returns 0.
# $dirname is accepted for call compatibility but not used here.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text=0;
        if (open(my $extr_text, '<', "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ /\w/) {$seen_text=1; last;}
            }
            close($extr_text);
        } else {
            # an unreadable output file is treated like an empty one
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # does this converter create a .out file?
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if (-e "$output_filestem.err") {
            # Read the log once and send it to both destinations. Failing
            # to read a diagnostic log is not worth dying for.
            my @err_lines = ();
            if (open(my $errlog, '<', "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close($errlog);
            } else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }

            # print out the converters std err, if any
            if (@err_lines) {
                print STDERR "pdftotext error log:\n";
                print STDERR @err_lines;
            }

            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print {$faillog} @err_lines;
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
743
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# On non-Windows systems gs is tried first. If gs is unavailable or
# reports an error, the text is instead scraped out of the file with a
# series of regular expressions. Whichever way "$output_filestem.text"
# was produced, it is then re-wrapped at roughly 72 columns.
# Returns 1 on success, 0 if no text file could be produced.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        # quoted like the other paths so a stem containing spaces works
        $cmd .= " 2> \"$output_filestem.err\"";
        $!=0;

        # Test the whole wait status rather than just the exit code
        # ($? >> 8), so that a command killed by a signal also counts as
        # a failure. If system returns -1 (couldn't start program), $!
        # holds the reason.
        my $status = system($cmd);

        if ($status != 0) {
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        else
        {   # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', "$output_filestem.text")) {
                my $first_line = <$psout>;
                if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($psout);
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print {$faillog} "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (defined(my $line = <$errlog>)) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file... (checked before the output file
        # is created, so a rejected input leaves nothing behind)
        if ($text !~ /^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n* /g; # asterisk?
        # NOTE(review): the replacement below is the control character
        # \023, which does not look like an e-acute - confirm intent.
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $out;
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out} "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length=72;
    &util::mv("$output_filestem.text", "$output_filestem.text.tmp");

    my ($infile, $outfile);
    if (!open($infile, '<', "$output_filestem.text.tmp")) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    if (!open($outfile, '>', "$output_filestem.text")) {
        print STDERR "Couldn't open file for writing: $!\n";
        close($infile);
        &util::rm("$output_filestem.text.tmp");
        return 0;
    }

    while (defined(my $line = <$infile>)) {
        while (length($line) > $wrap_length) {
            # take $wrap_length characters plus the rest of the current
            # word, then drop the whitespace that follows it
            if ($line =~ s/^(.{$wrap_length}\S*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                # nothing could be cut off: stop rather than loop forever
                last;
            }
        }
        print {$outfile} $line if (length($line) > 0);
    }
    close($infile);
    close($outfile);
    &util::rm("$output_filestem.text.tmp");

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
910
911
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Uses any_to_text to produce "$output_filestem.text", then wraps that
# text in a minimal HTML page written to "$output_filestem.html": blank
# lines become <p>, every other line is prefixed with <br>. The
# intermediate .text file is removed. Returns 1 on success, 0 otherwise.

sub any_to_html {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', "$output_filestem.text")) {
        return 0;
    }
    if (!open($html, '>', "$output_filestem.html")) {
        close($text);
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html} "<html><head>\n";
    print {$html} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html} "</head><body>\n\n";

    while (defined(my $line = <$text>)) {
        # escape HTML metacharacters; '&' must go first so the entities
        # produced for '<' and '>' are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print {$html} "<p>";
        } else {
            print {$html} "<br> ", $line;
        }
    }
    print {$html} "\n</body></html>\n";

    close($html);
    close($text);

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
948
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Reads the input in binary mode and writes to "$output_filestem.text"
# only the runs of printable ASCII (octal 040-176) that are at least 10
# characters long, one run per line. Returns 1 if anything was written;
# otherwise removes the output file and returns 0. Also returns 0 if
# either file cannot be opened.

sub any_to_text {
    # "my" keeps these from overwriting same-named package globals
    my ($input_filename, $output_filestem) = @_;

    my ($in, $out);
    open($in, '<', $input_filename) || return 0;
    binmode($in);
    if (!open($out, '>', "$output_filestem.text")) {
        close($in); # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: