Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 3350

Last change on this file since 3350 was 3350, checked in by sjboddie, 22 years ago
Added -use_strings option to ConvertToPlug. The default behaviour for plugins derived from ConvertToPlug (WordPlug, PDFPlug etc) is now to exclude documents that can't be converted correctly. They won't use the perl strings stuff to extract text unless the -use_strings option is specified.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 26.5 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
46	BEGIN {
47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49	}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
56	# Are we running on WinNT or Win2000 (or later)?
57	my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
58	if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
59
60	my $use_strings;
61
62	sub print_usage
63	{
64	print STDERR "\n";
65	print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
66	print STDERR " or text using third-party programs.\n\n";
67	print STDERR " usage: $0 [options] filename\n";
68	print STDERR " options:\n\t-type\tdoc\|pdf\|ps\|ppt\|rtf\|xls\t(input file type)\n";
69	print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
70	print STDERR "\t-output\thtml\|text\n";
71	print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
72	print STDERR "\t-use_strings\t(use strnigs to extract text if conversion fails)\n";
73	exit(1);
74	}
75
76	my $faillogfile="";
77
78	sub main
79	{
80	my (@ARGV) = @_;
81	my ($input_type,$output_type,$verbose,$timeout);
82
83	$timeout = 0;
84	# read command-line arguments
85	if (!parsargv::parse(\@ARGV,
86	'type/(doc\|pdf\|ps\|ppt\|rtf\|xls)/', \$input_type,
87	'/errlog/.*/', \$faillogfile,
88	'output/(html\|text)/', \$output_type,
89	'timeout/\d+/0',\$timeout,
90	'verbose/\d+/0', \$verbose,
91	'use_strings', \$use_strings))
92	{
93	print_usage();
94	}
95
96	# Make sure the input file exists and can be opened for reading
97	if (scalar(@ARGV!=1)) {
98	print_usage();
99	}
100
101	my $input_filename = $ARGV[0];
102	if (!-r $input_filename) {
103	print STDERR "Error: unable to open $input_filename for reading\n";
104	exit(1);
105	}
106
107	# Deduce filenames
108	my ($tailname,$dirname,$suffix)
109	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
110	my $output_filestem = &util::filename_cat($dirname, "$tailname");
111
112	if ($input_type eq "")
113	{
114	$input_type = lc (substr($suffix,1,length($suffix)-1));
115	}
116
117	# Change to temporary working directory
118	my $stored_dir = cwd();
119	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
120
121	# Select convert utility
122	if (!defined $input_type) {
123	print STDERR "Error: No filename extension or input type defined\n";
124	exit(1);
125	}
126	elsif ($input_type eq "doc") {
127	print &convertDOC($input_filename, $output_filestem, $output_type);
128	print "\n";
129	}
130	elsif ($input_type eq "rtf") {
131	print &convertRTF($input_filename, $output_filestem, $output_type);
132	print "\n";
133	}
134	elsif ($input_type eq "pdf") {
135	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
136	print "\n";
137	}
138	elsif ($input_type eq "ps") {
139	print &convertPS($input_filename, $output_filestem, $output_type);
140	print "\n";
141	}
142	elsif ($input_type eq "ppt") {
143	print &convertPPT($input_filename, $output_filestem, $output_type);
144	print "\n";
145	}
146	elsif ($input_type eq "xls") {
147	print &convertXLS($input_filename, $output_filestem, $output_type);
148	print "\n";
149	}
150	else {
151	print STDERR "Error: Unable to convert type '$input_type'\n";
152	exit(1);
153	}
154
155	# restore to original working directory
156	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
157
158	}
159
160	&main(@ARGV);
161
162
163
164	# Document-type conversion functions
165	#
166	# The following functions attempt to convert documents from their
167	# input type to the specified output type. If no output type was
168	# given, then they first attempt HTML, and then TEXT.
169	#
170	# Each returns the output type ("html" or "text") or "fail" if no
171	# conversion is possible.
172
173	# Convert a Microsoft word document
174
175	sub convertDOC {
176	($input_filename, $output_filestem, $output_type) = @_;
177
178	# Many .doc files are not in fact word documents!
179	my $realtype = &find_docfile_type($input_filename);
180
181	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
182	return &convertWord678($input_filename, $output_filestem, $output_type);
183	} elsif ($realtype eq "rtf") {
184	return &convertRTF($input_filename, $output_filestem, $output_type);
185	} else {
186	return &convertAnything($input_filename, $output_filestem, $output_type);
187	}
188	}
189
190	# Convert a Microsoft word 6/7/8 document
191
192	sub convertWord678 {
193	($input_filename, $output_filestem, $output_type) = @_;
194
195	my $success = 0;
196
197	# Attempt specialised conversion to HTML
198	if (!$output_type \|\| ($output_type =~ /html/i)) {
199	$success = &doc_to_html($input_filename, $output_filestem);
200	if ($success) {
201	return "html";
202	}
203	}
204
205	return &convertAnything($input_filename, $output_filestem, $output_type);
206	}
207
208
209	# Convert a Rich Text Format (RTF) file
210
211	sub convertRTF {
212	($input_filename, $output_filestem, $output_type) = @_;
213
214	my $success = 0;
215
216	# Attempt specialised conversion to HTML
217	if (!$output_type \|\| ($output_type =~ /html/i)) {
218	$success = &rtf_to_html($input_filename, $output_filestem);
219	if ($success) {
220	return "html";
221	}
222	}
223
224	# rtf is so ugly that's it's not worth running strings over.
225	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
226	# return &convertAnything($input_filename, $output_filestem, $output_type);
227	return "fail";
228	}
229
230
231	# Convert an unidentified file
232
233	sub convertAnything {
234	($input_filename, $output_filestem, $output_type) = @_;
235
236	my $success = 0;
237
238	# Attempt simple conversion to HTML
239	if (!$output_type \|\| ($output_type =~ /html/i)) {
240	$success = &any_to_html($input_filename, $output_filestem);
241	if ($success) {
242	return "html";
243	}
244	}
245
246	# Convert to text
247	if (!$output_type \|\| ($output_type =~ /text/i)) {
248	$success = &any_to_text($input_filename, $output_filestem);
249	if ($success) {
250	return "text";
251	}
252	}
253	return "fail";
254	}
255
256
257
258	# Convert an Adobe PDF document
259
260	sub convertPDF {
261	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
262
263	my $success = 0;
264
265	# Attempt conversion to HTML
266	if (!$output_type \|\| ($output_type =~ /html/i)) {
267	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
268	if ($success) {
269	return "html";
270	}
271	}
272
273	# Attempt conversion to TEXT
274	if (!$output_type \|\| ($output_type =~ /text/i)) {
275	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
276	if ($success) {
277	return "text";
278	}
279	}
280
281	return "fail";
282
283	}
284
285
286	# Convert an Adobe PostScript document
287
288	sub convertPS {
289	($input_filename, $output_filestem, $output_type) = @_;
290
291	my $success = 0;
292
293	# Attempt conversion to TEXT
294	if (!$output_type \|\| ($output_type =~ /text/i)) {
295	$success = &ps_to_text($input_filename, $output_filestem);
296	if ($success) {
297	return "text";
298	}
299	}
300
301	return "fail";
302
303	}
304
305
306	sub convertPPT {
307	my ($input_filename, $output_filestem, $output_type) = @_;
308
309	my $success = 0;
310
311	# Attempt conversion to HTML
312	if (!$output_type \|\| ($output_type =~ /html/i)) {
313	# formulate the command
314	$cmd = "";
315	$cmd .= "perl -S ppttohtml.pl ";
316	$cmd .= " \"$input_filename\" \"$output_filestem.html\"";
317	$cmd .= " 2>\"$output_filestem.err\""
318	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
319
320
321	# execute the command
322	$!=0;
323	if (system($cmd)!=0)
324	{
325	print STDERR "Powerpoint 95/97 converter failed $!\n";
326	} else {
327	return "html";
328	}
329	}
330
331	$success = &any_to_text($input_filename, $output_filestem);
332	if ($success) {
333	return "text";
334	}
335
336	return "fail";
337	}
338
339
340	sub convertXLS {
341	my ($input_filename, $output_filestem, $output_type) = @_;
342
343	my $success = 0;
344
345	# Attempt conversion to HTML
346	if (!$output_type \|\| ($output_type =~ /html/i)) {
347	# formulate the command
348	$cmd = "";
349	$cmd .= "perl -S xlstohtml.pl ";
350	$cmd .= " \"$input_filename\" \"$output_filestem.html\"";
351	$cmd .= " 2>\"$output_filestem.err\""
352	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
353
354
355	# execute the command
356	$!=0;
357	if (system($cmd)!=0)
358	{
359	print STDERR "Excel 95/97 converter failed $!\n";
360	} else {
361	return "html";
362	}
363	}
364
365	$success = &any_to_text($input_filename, $output_filestem);
366	if ($success) {
367	return "text";
368	}
369
370	return "fail";
371	}
372
373
374
375
376
377	# Find the real type of a .doc file
378	#
379	# We seem to have a lot of files with a .doc extension that are .rtf
380	# files or Word 5 files. This function attempts to tell the difference.
381
382	sub find_docfile_type {
383	($input_filename) = @_;
384
385	open(CHK, "<$input_filename");
386	binmode(CHK);
387	my $line = "";
388	my $first = 1;
389
390	while (<CHK>) {
391
392	$line = $_;
393
394	if ($first) {
395	# check to see if this is an rtf file
396	if ($line =~ /^\{\\rtf/) {
397	close(CHK);
398	return "rtf";
399	}
400	$first = 0;
401	}
402
403	# is this is a word 6/7/8 document?
404	if ($line =~ /Word\.Document\.([678])/) {
405	close(CHK);
406	return "word$1";
407	}
408
409	}
410
411	return "unknown";
412	}
413
414
415
416	# Specific type-to-type conversions
417	#
418	# Each of the following functions attempts to convert a document from
419	# a specific format to another. If they succeed they return 1 and leave
420	# the output document(s) in the appropriate place; if they fail they
421	# return 0 and delete any working files.
422
423
424	# Attempt to convert a word document to html with the wv program
425
426	sub doc_to_html {
427	($input_filename, $output_filestem) = @_;
428
429	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
430	$ENV{'GSDLOS'}, "wvWare");
431
432	# don't include path on windows (to avoid having to play about
433	# with quoting when GSDLHOME might contain spaces) but assume
434	# that the PATH is set up correctly
435	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
436
437	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
438	"packages", "wv", "wvHtml.xml");
439
440	my $cmd = "";
441	if ($timeout) {$cmd = "ulimit -t $timeout;";}
442	$cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
443	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
444
445	# redirecting STDERR is a bad idea on windows 95/98
446	$cmd .= " 2> \"$output_filestem.err\""
447	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
448
449	# execute the command
450	$!=0;
451	if (system($cmd)!=0)
452	{
453	print STDERR "Error executing wv converter:$!\n";
454	if (-s "$output_filestem.err") {
455	open (ERRFILE, "<$output_filestem.err");
456
457	my $write_to_fail_log=0;
458	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
459	{$write_to_fail_log=1;}
460
461	my $line;
462	while ($line=<ERRFILE>) {
463	if ($line =~ /\w/) {
464	print STDERR "$line";
465	print FAILLOG "$line" if ($write_to_fail_log);
466	}
467	if ($line !~ m/startup error/) {next;}
468	print STDERR " (given an invalid .DOC file?)\n";
469	print FAILLOG " (given an invalid .DOC file?)\n"
470	if ($write_to_fail_log);
471
472	} # while ERRFILE
473	close FAILLOG if ($write_to_fail_log);
474	}
475	return 0; # we can try any_to_text
476	}
477
478	# Was the conversion successful?
479
480	if (-s "$output_filestem.html") {
481	open(TMP, "$output_filestem.html");
482	$line = <TMP>;
483	close(TMP);
484	if ($line && $line =~ /DOCTYPE HTML/) {
485	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
486	return 1;
487	}
488	}
489
490	# If here, an error of some sort occurred
491	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
492	if (-e "$output_filestem.err") {
493	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
494	open (ERRLOG,"$output_filestem.err");
495	while (<ERRLOG>) {print FAILLOG $_;}
496	close FAILLOG;
497	close ERRLOG;
498	}
499	&util::rm("$output_filestem.err");
500	}
501
502	return 0;
503	}
504
505
506	# Attempt to convert an RTF document to html with rtftohtml
507
508	sub rtf_to_html {
509	my ($input_filename, $output_filestem) = @_;
510
511	# formulate the command
512	$cmd = "";
513	if ($timeout) {$cmd = "ulimit -t $timeout;";}
514	$cmd .= "rtftohtml";
515
516	$cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
517
518	$cmd .= " 2>\"$output_filestem.err\""
519	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
520
521
522	# execute the command
523	$!=0;
524	if (system($cmd)!=0)
525	{
526	print STDERR "Error executing rtf converter $!\n";
527	# don't currently bother printing out error log...
528	# keep going, in case it still created an HTML file...
529	}
530
531	# Was the conversion successful?
532	my $was_successful=0;
533	if (-s "$output_filestem.html") {
534	# make sure we have some content other than header
535	open (HTML, "$output_filestem.html"); # what to do if fail?
536	my $line;
537	my $past_header=0;
538	while ($line=<HTML>) {
539
540	if ($past_header == 0) {
541	if ($line =~ /<body>/) {$past_header=1;}
542	next;
543	}
544
545	$line =~ s/<[^>]+>//g;
546	if ($line =~ /\w/ && $past_header) { # we found some content...
547	$was_successful=1;
548	last;
549	}
550	}
551	close HTML;
552	}
553
554	if ($was_successful) {
555	&util::rm("$output_filestem.err")
556	if (-e "$output_filestem.err");
557	# insert the (modified) table of contents, if it exists.
558	if (-e "${output_filestem}_ToC.html") {
559	&util::mv("$output_filestem.html","$output_filestem.src");
560	my $open_failed=0;
561	open HTMLSRC, "$output_filestem.src" \|\| ++$open_failed;
562	open TOC, "${output_filestem}_ToC.html" \|\| ++$open_failed;
563	open HTML, ">$output_filestem.html" \|\| ++$open_failed;
564
565	if ($open_failed) {
566	close HTMLSRC;
567	close TOC;
568	close HTML;
569	&util::mv("$output_filestem.src","$output_filestem.html");
570	return 1;
571	}
572
573	# print out header info from src html.
574	while (($_ = <HTMLSRC>) =~ /\w/) {
575	print HTML "$_";
576	}
577
578	# print out table of contents, making links relative
579	<TOC>; <TOC>; # ignore first 2 lines
580	print HTML scalar(<TOC>); # line 3 = "<ol>\n"
581	my $line;
582	while ($line=<TOC>) {
583	$line =~ s@</body></html>$@@ ; # only last line has this
584	# make link relative
585	$line =~ s@href=\"[^\#]+@href=\"@;
586	print HTML $line;
587	}
588	close TOC;
589
590	# rest of html src
591	while (<HTMLSRC>) {
592	print HTML $_;
593	}
594	close HTMLSRC;
595	close HTML;
596
597	&util::rm("${output_filestem}_ToC.html");
598	&util::rm("${output_filestem}.src");
599	}
600	# we don't yet do anything with footnotes ($output_filestem_fn.html) :(
601	return 1; # success
602	}
603
604	if (-e "$output_filestem.err") {
605	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
606	{
607	print FAILLOG "Error - rtftohtml - couldn't extract text\n";
608	print FAILLOG " (rtf file might be too recent):\n";
609	open (ERRLOG, "$output_filestem.err");
610	while (<ERRLOG>) {print FAILLOG $_;}
611	close ERRLOG;
612	close FAILLOG;
613	}
614	&util::rm("$output_filestem.err");
615	}
616
617	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
618
619	return 0;
620	}
621
622
623	# Convert a pdf file to html with the pdftohtml command
624
625	sub pdf_to_html {
626	my ($dirname, $input_filename, $output_filestem) = @_;
627
628	$cmd = "";
629	if ($timeout) {$cmd = "ulimit -t $timeout;";}
630	$cmd .= "perl -S pdftohtml.pl ";
631	$cmd .= " \"$input_filename\" \"$output_filestem\"";
632
633	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000) {
634	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
635	} else {
636	$cmd .= " > \"$output_filestem.err\"";
637	}
638
639	$!=0;
640
641	my $retval=system($cmd);
642	if ($retval!=0)
643	{
644	print STDERR "Error executing pdftohtml.pl";
645	if ($!) {print STDERR ": $!";}
646	print STDERR "\n";
647	}
648
649	# make sure the converter made something
650	if ($retval!=0 \|\| ! -s "$output_filestem.html")
651	{
652	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
653	# print out the converter's std err, if any
654	if (-s "$output_filestem.err") {
655	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
656	print STDERR "pdftohtml error log:\n";
657	while (<ERRLOG>) {
658	print STDERR "$_";
659	}
660	close ERRLOG;
661	}
662	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
663	if (-e "$output_filestem.err") {
664	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
665	{
666	open (ERRLOG, "$output_filestem.err");
667	while (<ERRLOG>) {print FAILLOG $_;}
668	close ERRLOG;
669	close FAILLOG;
670	}
671	&util::rm("$output_filestem.err");
672	}
673	return 0;
674	}
675
676	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
677	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
678	return 1;
679	}
680
681	# Convert a PDF file to text with the pdftotext command
682
683	sub pdf_to_text {
684	my ($dirname, $input_filename, $output_filestem) = @_;
685
686	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
687
688	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
689	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
690	} else {
691	$cmd .= " > \"$output_filestem.err\"";
692	}
693
694	if (system($cmd)!=0)
695	{
696	print STDERR "Error executing $cmd: $!\n";
697	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
698	}
699
700	# make sure there is some extracted text.
701	if (-e "$output_filestem.text") {
702	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
703	binmode(EXTR_TEXT); # just in case...
704	my $line="";
705	my $seen_text=0;
706	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
707	if ($line=~ /\w/) {$seen_text=1;}
708	}
709	close EXTR_TEXT;
710	if ($seen_text==0) { # no text was extracted
711	print STDERR "Error: pdftotext found no text\n";
712	&util::rm("$output_filestem.text");
713	}
714	}
715
716	# make sure the converter made something
717	if (! -s "$output_filestem.text")
718	{
719	# print out the converters std err, if any
720	if (-s "$output_filestem.err") {
721	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
722	print STDERR "pdftotext error log:\n";
723	while (<ERRLOG>) {
724	print STDERR "$_";
725	}
726	close ERRLOG;
727	}
728	# does this converter create a .out file?
729	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
730	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
731	if (-e "$output_filestem.err") {
732	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
733	{
734	open (ERRLOG,"$output_filestem.err");
735	while (<ERRLOG>) {print FAILLOG $_;}
736	close ERRLOG;
737	close FAILLOG;
738	}
739	&util::rm("$output_filestem.err");
740	}
741	return 0;
742	}
743	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
744	return 1;
745	}
746
747	# Convert a PostScript document to text
748	# note - just using "ps2ascii" isn't good enough, as it
749	# returns 0 for a postscript interpreter error. ps2ascii is just
750	# a wrapper to "gs" anyway, so we use that cmd here.
751
752	sub ps_to_text {
753	my ($input_filename, $output_filestem) = @_;
754
755	my $error = "";
756
757	# if we're on windows we'll fall straight through without attempting
758	# to use gs
759	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
760	$error = "Windows does not support gs";
761
762	} else {
763	my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
764	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
765	$cmd .= " 2> $output_filestem.err";
766	$!=0;
767
768	my $retcode=system($cmd);
769	$retcode = $? >> 8; # see man perlfunc - system for this...
770	# if system returns -1 \| 127 (couldn't start program), look at $! for message
771
772	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
773	elsif (! -e "$output_filestem.text") {
774	$error="did not create output file.\n";
775	}
776	else
777	{ # make sure the interpreter didn't get an error. It is technically
778	# possible for the actual text to start with this, but....
779	open PSOUT, "$output_filestem.text";
780	if (<PSOUT> =~ /^Error: (.*)/) {
781	$error="interpreter error - \"$1\"";
782	}
783	close PSOUT;
784	}
785	}
786
787	if ($error ne "")
788	{
789	print STDERR "Warning: Error executing gs: $error\n";
790	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
791
792	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
793	{
794	print FAILLOG "gs - $error\n";
795	if (-e "$output_filestem.err") {
796	open(ERRLOG, "$output_filestem.err");
797	while (<ERRLOG>) {print FAILLOG $_;}
798	close ERRLOG;
799	}
800	close FAILLOG;
801	}
802	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
803
804
805	# Fine then. We'll just do a lousy job by ourselves...
806	# Based on 5-line regexp sed script found at:
807	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
808	#
809	print STDERR "Stripping text from postscript\n";
810	my $errorcode=0;
811	open (IN, "$input_filename")
812	\|\| ($errorcode=1, warn "Couldn't read file: $!");
813	open (OUT, ">$output_filestem.text")
814	\|\| ($errorcode=1, warn "Couldn't write file: $!");
815	if ($errorcode) {print STDERR "errors\n";return 0;}
816
817	my $text=""; # this is for whole .ps file...
818	$text = join('', <IN>); # see man perlport, under "System Resources"
819	close IN;
820
821	# Make sure this is a ps file...
822	if ($text !~ /^%!/) {
823	print STDERR "Bad postscript header: not '%!'\n";
824	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
825	{
826	print FAILLOG "Bad postscript header: not '%!'\n";
827	close FAILLOG;
828	}
829	return 0;
830	}
831
832	# if ps has Page data, then use it to delete all stuff before it.
833	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
834
835	# remove all leading non-data stuff
836	$text =~ s/^.*?\(//s;
837
838	# remove all newline chars for easier processing
839	$text =~ s/\n//g;
840
841	# Big assumption here - assume that if any co-ordinates are
842	# given, then we are at the end of a sentence.
843	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
844
845	# special characters--
846	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
847
848	# ? ps text formatting (eg italics?) ?
849	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
850	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
851	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
852	# default - remove the rest
853	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
854
855	# attempt to add whitespace between words...
856	# this is based purely on observation, and may be completely wrong...
857	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
858	# eg I notice "b(" is sometimes NOT a space if preceded by a
859	# negative number.
860	$text =~ s/\)\d+ ?b\(/\) \( /g;
861
862	# change quoted braces to brackets
863	$text =~ s/([^\\])\\\(/$1\{/g;
864	$text =~ s/([^\\])\\\)/$1\}/g ;
865
866	# remove everything that is not between braces
867	$text =~ s/\)([^\(\)])+?\(//sg ;
868
869	# remove any Trailer eof stuff.
870	$text =~ s/\)[^\)]*$//sg;
871
872	### ligatures have special characters...
873	$text =~ s/\\013/ff/g;
874	$text =~ s/\\014/fi/g;
875	$text =~ s/\\015/fl/g;
876	$text =~ s/\\016/ffi/g;
877	$text =~ s/\\214/fi/g;
878	$text =~ s/\\215/fl/g;
879	$text =~ s/\\017/\n\* /g; # asterisk?
880	$text =~ s/\\023/\023/g; # e acute ('e)
881	$text =~ s/\\177/\252/g; # u"
882	# $text =~ s/ ?? /\344/g; # a"
883
884	print OUT "$text";
885	close OUT;
886	}
887	# wrap the text - use a minimum length. ie, first space after this length.
888	my $wrap_length=72;
889	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
890	open INFILE, "$output_filestem.text.tmp" \|\|
891	die "Couldn't open file: $!";
892	open OUTFILE, ">$output_filestem.text" \|\|
893	die "Couldn't open file for writing: $!";
894	my $line="";
895	while ($line=<INFILE>) {
896	while (length($line)>0) {
897	if (length($line)>$wrap_length) {
898	$line =~ s/^(.{$wrap_length}[^\s])\s//;
899	print OUTFILE "$1\n";
900	} else {
901	print OUTFILE "$line";
902	$line="";
903	}
904	}
905	}
906	close INFILE;
907	close OUTFILE;
908	&util::rm("$output_filestem.text.tmp");
909
910	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
911	return 1;
912	}
913
914
915	# Convert any file to HTML with a crude perl implementation of the
916	# UNIX strings command.
917
918	sub any_to_html {
919	($input_filename, $output_filestem) = @_;
920
921	# First generate a text file
922	return 0 unless (&any_to_text($input_filename, $output_filestem));
923
924	# create an HTML file from the text file
925	open(TEXT, "<$output_filestem.text");
926	open(HTML, ">$output_filestem.html");
927
928	print HTML "<html><head>\n";
929	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
930	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
931	print HTML "</head><body>\n\n";
932
933	my $line;
934	while ($line=<TEXT>) {
935	$line =~ s/</</g;
936	$line =~ s/>/>/g;
937	if ($line =~ /^\s*$/) {
938	print HTML "<p>";
939	} else {
940	print HTML "<br> ", $line;
941	}
942	}
943	print HTML "\n</body></html>\n";
944
945	close HTML;
946	close TEXT;
947
948	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
949	return 1;
950	}
951
952	# Convert any file to TEXT with a crude perl implementation of the
953	# UNIX strings command.
954	# Note - this assumes ascii charsets :( (jrm21)
955
956	sub any_to_text {
957	($input_filename, $output_filestem) = @_;
958
959	if (!$use_strings) {
960	return 0;
961	}
962
963	open(IN, "<$input_filename") \|\| return 0;
964	binmode(IN);
965	open(OUT, ">$output_filestem.text") \|\| return 0;
966
967	my ($line);
968	my $output_line_count = 0;
969	while (<IN>) {
970	$line = $_;
971
972	# delete anything that isn't a printable character
973	$line =~ s/[^\040-\176]+/\n/sg;
974
975	# delete any string less than 10 characters long
976	$line =~ s/^.{0,9}$/\n/mg;
977	while ($line =~ /^.{1,9}$/m) {
978	$line =~ s/^.{0,9}$/\n/mg;
979	$line =~ s/\n+/\n/sg;
980	}
981
982	# remove extraneous whitespace
983	$line =~ s/\n+/\n/gs;
984	$line =~ s/^\n//gs;
985
986	# output whatever is left
987	if ($line =~ /[^\n ]/) {
988	print OUT $line;
989	++$output_line_count;
990	}
991	}
992
993	close OUT;
994	close IN;
995
996	if ($output_line_count) { # try to protect against binary only formats
997	return 1;
998	}
999
1000	&util::rm("$output_filestem.text");
1001	return 0;
1002
1003	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: