Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 4103

Last change on this file since 4103 was 4103, checked in by sjboddie, 21 years ago
Added a -nohidden PDFPlug option and made it pass the -hidden option to pdftohtml by default.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 27.3 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time: refuse to start without GSDLHOME, and put
# $GSDLHOME/perllib at the front of @INC so that modules loaded by the
# "use" lines that follow are searched for there first.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift @INC, $ENV{'GSDLHOME'} . "/perllib";
}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Option values filled in by main() from the command line and read by the
# individual converter routines.
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom, $pdf_ignore_images);
65
# Print the command-line synopsis to STDERR and terminate the program with
# exit status 1.  Never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\thtml|text\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this line
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
85
# $faillogfile: file that converter error output is appended to ("" = none).
# $timeout:     value passed to "ulimit -t" ahead of a converter command
#               (0 = no limit is added).
my ($faillogfile, $timeout) = ("", 0);
88
# Entry point.  Parses the command line, determines the input type (from
# -type or, failing that, from the filename extension), changes into the
# directory of the input file, runs the matching convertXXX routine and
# prints its result ("html", "text" or "fail") followed by a newline on
# STDOUT.  Exits with status 1 on usage errors, an unreadable input file or
# an unsupported type.
#
# NOTE(review): the converters are handed $input_filename unchanged after
# the chdir, so a relative path that contains a directory component would no
# longer resolve -- presumably callers always pass absolute paths; confirm.
sub main
{
    # work on a private copy rather than shadowing the global @ARGV
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    # NOTE(review): the errlog spec starts with a "/" unlike its siblings;
    # left exactly as found -- confirm against parsargv's spec syntax.
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_nohidden', \$pdf_nohidden,
                         # documented in print_usage and read by pdf_to_html,
                         # but previously never parsed
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one filename must be left once the options are consumed
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the extension (without its leading dot).
    # A file with no extension leaves the type undefined so that the
    # dedicated error below is actually reachable.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "doc" || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result, "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
173
# Run the conversion with the arguments given on the command line.
main(@ARGV);
175
176
177
178	# Document-type conversion functions
179	#
180	# The following functions attempt to convert documents from their
181	# input type to the specified output type. If no output type was
182	# given, then they first attempt HTML, and then TEXT.
183	#
184	# Each returns the output type ("html" or "text") or "fail" if no
185	# conversion is possible.
186
# Convert a Microsoft word document
#
# Sniffs the real file type with find_docfile_type() and dispatches to the
# Word 6/7/8 converter, the RTF converter, or the generic fallback.
# Returns whatever the chosen converter returns ("html", "text" or "fail").

sub convertDOC {
    # lexical copies -- the bare list assignment used before overwrote
    # package globals of the same names
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    return convertAnything($input_filename, $output_filestem, $output_type);
}
203
# Convert a Microsoft word 6/7/8 document
#
# Tries doc_to_html() when HTML output is acceptable (no output type given,
# or one matching /html/i) and otherwise, or on failure, defers to
# convertAnything().  Returns "html", "text" or "fail".

sub convertWord678 {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if doc_to_html($input_filename, $output_filestem);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
221
222
# Convert a Rich Text Format (RTF) file
#
# Tries rtf_to_html() when HTML output is acceptable (no output type given,
# or one matching /html/i).  There is no text fallback.
# Returns "html" or "fail".

sub convertRTF {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if rtf_to_html($input_filename, $output_filestem);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
243
244
# Convert an unidentified file
#
# Tries any_to_html() when HTML is acceptable, then any_to_text() when text
# is acceptable; a type is acceptable when no output type was given or the
# given one matches it case-insensitively.
# Returns "html", "text" or "fail".

sub convertAnything {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
269
270
271
# Convert an Adobe PDF document
#
# HTML is tried first and TEXT second; each is only attempted when no output
# type was requested or the requested type matches it case-insensitively.
# Returns "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;
    my $html_wanted   = $no_preference || ($output_type =~ /html/i);
    my $text_wanted   = $no_preference || ($output_type =~ /text/i);

    # Attempt conversion to HTML
    if ($html_wanted
        && pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if ($text_wanted
        && pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
298
299
# Convert an Adobe PostScript document
#
# Only text output is supported: ps_to_text() is tried when no output type
# was given or the given one matches /text/i.  Returns "text" or "fail".

sub convertPS {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
318
319
# Convert a Microsoft PowerPoint document
#
# When HTML is acceptable (no output type given, or one matching /html/i)
# runs "perl -S ppttohtml.pl" and returns "html" if the command exits with
# status 0.  Otherwise, or on failure, falls back to any_to_text() --
# regardless of the requested output type -- and returns "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd used to be an undeclared package global)
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected where that is known to be safe
        # (non-Windows, or WinNT/2000)
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
352
353
# Convert a Microsoft Excel document
#
# When HTML is acceptable (no output type given, or one matching /html/i)
# runs "perl -S xlstohtml.pl" and returns "html" if the command exits with
# status 0.  Otherwise, or on failure, falls back to any_to_text() --
# regardless of the requested output type -- and returns "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd used to be an undeclared package global)
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected where that is known to be safe
        # (non-Windows, or WinNT/2000)
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
386
387
388
389
390
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Takes the name of the file to inspect.  Returns
#   "rtf"               if the first line starts with "{\rtf",
#   "word6".."word8"    if any line contains "Word.Document.<6|7|8>",
#   "unknown"           otherwise, including when the file cannot be opened.

sub find_docfile_type {
    my ($input_filename) = @_;

    # A lexical handle is closed automatically on every return path; the
    # bareword handle used before stayed open when "unknown" was returned.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $first = 1;
    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            return "rtf" if $line =~ /^\{\\rtf/;
            $first = 0;
        }

        # is this a word 6/7/8 document?
        return "word$1" if $line =~ /Word\.Document\.([678])/;
    }

    return "unknown";
}
427
428
429
430	# Specific type-to-type conversions
431	#
432	# Each of the following functions attempts to convert a document from
433	# a specific format to another. If they succeed they return 1 and leave
434	# the output document(s) in the appropriate place; if they fail they
435	# return 0 and delete any working files.
436
437
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare, writing "$output_filestem.html" (and, where stderr can be
# redirected, "$output_filestem.err").  Returns 1 when the command succeeds
# and the first line of the output contains "DOCTYPE HTML"; otherwise the
# converter's error output is copied to the fail log (if one is configured),
# the working files are removed, and 0 is returned.

sub doc_to_html {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                    $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                     "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";

        if (-s "$output_filestem.err") {
            # $faillog stays undef unless a fail log is configured and opens
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR $line;
                        print {$faillog} $line if $faillog;
                    }
                    next unless $line =~ m/startup error/;
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if $faillog;
                }
                close $errfile;
            }
            close $faillog if $faillog;
        }

        # honour the "delete any working files on failure" contract; this
        # path used to leave the partial .html and the .err behind
        for my $leftover ("$output_filestem.html", "$output_filestem.err") {
            util::rm($leftover) if -e $leftover;
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close $tmp;
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close $errlog;
            }
            close $faillog;
        }
        util::rm("$output_filestem.err");
    }

    return 0;
}
518
519
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml to produce "$output_filestem.html".  The conversion counts
# as successful when that file contains at least one word character after
# its <body> line once tags are stripped.  On success a generated
# "${output_filestem}_ToC.html" table of contents, if present, is spliced in
# after the header of the HTML file, and 1 is returned.  On failure the
# converter's stderr is appended to the fail log (if configured), the
# working files are removed, and 0 is returned.

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command ($cmd used to be an undeclared package global)
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html>) {
            if (!$past_header) {
                $past_header = 1 if $line =~ /<body>/;
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close $html;
    }

    if ($was_successful) {
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            util::mv("$output_filestem.html", "$output_filestem.src");

            # "or" rather than "||": with "||" the test bound to the
            # filename string, so a failed open was never noticed.
            my ($htmlsrc, $toc, $html);
            my $open_failed = 0;
            open($htmlsrc, '<', "$output_filestem.src")      or ++$open_failed;
            open($toc,     '<', "${output_filestem}_ToC.html") or ++$open_failed;
            open($html,    '>', "$output_filestem.html")     or ++$open_failed;

            if ($open_failed) {
                # close whichever handles did open, then put the
                # unmodified HTML back in place
                for my $fh ($htmlsrc, $toc, $html) {
                    close $fh if defined $fh && defined fileno($fh);
                }
                util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped).
            while (defined(my $line = <$htmlsrc>)) {
                last unless $line =~ /\w/;
                print {$html} $line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>;       # ignore first 2 lines
            $ignored = <$toc>;
            my $list_start = <$toc>;    # line 3 = "<ol>\n"
            print {$html} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html} $line;
            }
            close $toc;

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close $htmlsrc;
            close $html;

            util::rm("${output_filestem}_ToC.html");
            util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close $errlog;
            }
            close $faillog;
        }
        util::rm("$output_filestem.err");
    }

    util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
635
636
# Convert a pdf file to html with the pdftohtml command
#
# Runs "perl -S pdftohtml.pl" with -zoom, and -c / -i / -hidden according to
# the pdf_* options, producing "$output_filestem.html".  Returns 1 when the
# command exits with status 0 and the HTML file is non-empty.  Otherwise the
# converter's error output is echoed to STDERR and appended to the fail log
# (if configured), the working files are removed, and 0 is returned.
# $dirname is accepted for interface compatibility but not used.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # $cmd used to be an undeclared package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # where stderr cannot be redirected, stdout is captured in .err instead
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if $!;
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }

        util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) { print {$faillog} $line; }
                    close $errlog;
                }
                close $faillog;
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
697
# Convert a PDF file to text with the pdftotext command
#
# Runs pdftotext to produce "$output_filestem.text".  Returns 1 when that
# file exists and contains at least one word character.  Otherwise the
# converter's error output is echoed to STDERR and appended to the fail log
# (if configured), the working files are removed, and 0 is returned.
# $dirname is accepted for interface compatibility but not used.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extr_text, '<', "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ /\w/) { $seen_text = 1; last; }
            }
            close $extr_text;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close $errlog;
        }

        util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) { print {$faillog} $line; }
                    close $errlog;
                }
                close $faillog;
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    # the stdout redirect always creates a .out file on non-windows systems;
    # it used to be left behind on success
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
763
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# On non-windows systems gs is run first.  If gs cannot be used, fails, or
# reports an interpreter error, the error is logged and a crude
# regular-expression based extraction (_ps_strip_text) is used instead.
# The resulting "$output_filestem.text" is then re-wrapped.
# Returns 1 on success, 0 if no text file could be produced.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        # quoted like every other redirect, so paths with spaces work
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        system($cmd);
        # Test the whole wait status, not just the exit code in the high
        # byte: a child killed by a signal (e.g. when the ulimit is hit)
        # has exit code 0 there but must still count as a failure.
        my $status = $?;

        if ($status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', "$output_filestem.text")) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            close $psout;
            if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close $errlog;
            }
            close $faillog;
        }
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "Stripping text from postscript\n";
        return 0
            unless _ps_strip_text($input_filename, "$output_filestem.text");
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    _ps_wrap_text_file("$output_filestem.text", 72);

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}

# Weak PostScript text extraction: keep what is inside (...) string
# operands and throw the rest away.  Writes the result to $text_filename.
# Returns 1 on success, 0 if the input cannot be read, does not start with
# "%!", or the output cannot be written (no output file is left behind in
# the first two cases).
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_strip_text {
    my ($input_filename, $text_filename) = @_;

    my $in;
    if (!open($in, '<', $input_filename)) {
        warn "Couldn't read file: $!";
        print STDERR "errors\n";
        return 0;
    }
    # this is for whole .ps file... see man perlport, under "System Resources"
    my $text = join('', <$in>);
    close $in;

    # Make sure this is a ps file... (checked before the output file is
    # created, so that a rejected input leaves no empty .text file)
    if ($text !~ /^%!/) {
        print STDERR "Bad postscript header: not '%!'\n";
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Bad postscript header: not '%!'\n";
            close $faillog;
        }
        return 0;
    }

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
    # $text =~ s/ ?? /\344/g; # a"

    my $out;
    if (!open($out, '>', $text_filename)) {
        warn "Couldn't write file: $!";
        print STDERR "errors\n";
        return 0;
    }
    print {$out} $text;
    close $out;
    return 1;
}

# Re-wrap $text_filename in place: every line longer than $wrap_length is
# broken at the first whitespace found at or after column $wrap_length.
# Dies if the temporary copy or the rewritten file cannot be opened.
sub _ps_wrap_text_file {
    my ($text_filename, $wrap_length) = @_;

    my $tmp_filename = "$text_filename.tmp";
    util::mv($text_filename, $tmp_filename);

    # "or", not "||": with "||" the die was attached to the filename string
    # and could never fire.
    open(my $infile, '<', $tmp_filename)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_filename)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Only treat the line as "wrapped" when the substitution really
            # removed a chunk; otherwise emit the remainder unchanged so the
            # loop always makes progress.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} $line;
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    util::rm($tmp_filename);
    return;
}
932
933
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# First produces "$output_filestem.text" via any_to_text(), then wraps that
# text in a minimal HTML page: blank lines become <p>, every other line is
# prefixed with <br>.  The intermediate text file is removed.
# Returns 1 on success, 0 if no text could be extracted or a file could not
# be opened.

sub any_to_html {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless any_to_text($input_filename, $output_filestem);

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', "$output_filestem.text")) {
        print STDERR "Error: unable to read $output_filestem.text: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }
    if (!open($html, '>', "$output_filestem.html")) {
        print STDERR "Error: unable to write $output_filestem.html: $!\n";
        close $text;
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html} "<html><head>\n";
    print {$html} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html} "</head><body>\n\n";

    while (my $line = <$text>) {
        # Escape markup characters so extracted text cannot be mistaken for
        # tags; "&" goes first so the entities added below are not
        # themselves re-escaped.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print {$html} "<p>";
        } else {
            print {$html} "<br> ", $line;
        }
    }
    print {$html} "\n</body></html>\n";

    close $html;
    close $text;

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
970
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Does nothing (returns 0) unless the -use_strings option was given.
# Otherwise copies every run of at least 10 printable ASCII characters from
# $input_filename to "$output_filestem.text", one run per line.
# Returns 1 if anything was written; otherwise removes the text file and
# returns 0.  Also returns 0 if either file cannot be opened.

sub any_to_text {
    # lexical copies -- previously these were assigned to package globals
    my ($input_filename, $output_filestem) = @_;

    return 0 if !$use_strings;

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in;      # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    # try to protect against binary only formats
    return 1 if $output_line_count;

    util::rm("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: