Context Navigation

source: tags/gsdl-2_37-distribution/gsdl/bin/script/gsConvert.pl@ 2843

Last change on this file since 2843 was 2755, checked in by jrm21, 23 years ago
import.pl now takes an option for saving file conversion failures to a log. By default, import.pl will use <collectdir>/etc/fail.log. Currently only the plugins based on ConvertToPlug will do this. Not yet tested on Win9X.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 24.6 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
35	# using specialised conversion utilities. We can try to convert any file to
36	# text with a perl implementation of the UNIX strings command.
37	#
38	# We try to convert Postscript files to text using "gs" which is often on
39	# *nix machines. We fall back to performing weak text extraction by using
40	# regular expressions.
41
42	BEGIN {
43	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
44	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
45	}
46
47	use parsargv;
48	use util;
49	use Cwd;
50	use File::Basename;
51
52	# Are we running on WinNT or Win2000 (or later)?
53	my $is_winnt_2000=eval {require Win32; return (Win32::IsWinNT()); return 0;};
54	if (!defined($is_winnt_2000)) {$is_winnt_2000=0;}
55
56	sub print_usage
57	{
58	print STDERR "\n";
59	print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
60	print STDERR " or text using third-party programs.\n\n";
61	print STDERR " usage: $0 [options] filename\n";
62	print STDERR " options:\n\t-type\tdoc\|pdf\|ps\|rtf\t(input file type)\n";
63	print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
64	print STDERR "\t-output\thtml\|text\n";
65	print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
66	exit(1);
67	}
68
69	my $faillogfile="";
70
71	sub main
72	{
73	my (@ARGV) = @_;
74	my ($input_type,$output_type,$verbose,$timeout);
75
76	$timeout = 0;
77	# read command-line arguments
78	if (!parsargv::parse(\@ARGV,
79	'type/(doc\|pdf\|ps\|rtf)/', \$input_type,
80	'/errlog/.*/', \$faillogfile,
81	'output/(html\|text)/', \$output_type,
82	'timeout/\d+/0',\$timeout,
83	'verbose/\d+/0', \$verbose))
84	{
85	print_usage();
86	}
87
88	# Make sure the input file exists and can be opened for reading
89	if (scalar(@ARGV!=1)) {
90	print_usage();
91	}
92
93	my $input_filename = $ARGV[0];
94	if (!-r $input_filename) {
95	print STDERR "Error: unable to open $input_filename for reading\n";
96	exit(1);
97	}
98
99	# Deduce filenames
100	my ($tailname,$dirname,$suffix)
101	= File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
102	my $output_filestem = &util::filename_cat($dirname, "$tailname");
103
104	if ($input_type eq "")
105	{
106	$input_type = lc (substr($suffix,1,length($suffix)-1));
107	}
108
109	# Change to temporary working directory
110	my $stored_dir = cwd();
111	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
112
113	# Select convert utility
114	if (!defined $input_type) {
115	print STDERR "Error: No filename extension or input type defined\n";
116	exit(1);
117	}
118	elsif ($input_type eq "doc") {
119	print &convertDOC($input_filename, $output_filestem, $output_type);
120	print "\n";
121	}
122	elsif ($input_type eq "rtf") {
123	print &convertRTF($input_filename, $output_filestem, $output_type);
124	print "\n";
125	}
126	elsif ($input_type eq "pdf") {
127	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
128	print "\n";
129	}
130	elsif ($input_type eq "ps") {
131	print &convertPS($input_filename, $output_filestem, $output_type);
132	print "\n";
133	}
134	else {
135	print STDERR "Error: Unable to convert type '$input_type'\n";
136	exit(1);
137	}
138
139	# restore to original working directory
140	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
141
142	}
143
144	&main(@ARGV);
145
146
147
148	# Document-type conversion functions
149	#
150	# The following functions attempt to convert documents from their
151	# input type to the specified output type. If no output type was
152	# given, then they first attempt HTML, and then TEXT.
153	#
154	# Each returns the output type ("html" or "text") or "fail" if no
155	# conversion is possible.
156
157	# Convert a Microsoft word document
158
159	sub convertDOC {
160	($input_filename, $output_filestem, $output_type) = @_;
161
162	# Many .doc files are not in fact word documents!
163	my $realtype = &find_docfile_type($input_filename);
164
165	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
166	return &convertWord678($input_filename, $output_filestem, $output_type);
167	} elsif ($realtype eq "rtf") {
168	return &convertRTF($input_filename, $output_filestem, $output_type);
169	} else {
170	return &convertAnything($input_filename, $output_filestem, $output_type);
171	}
172	}
173
174	# Convert a Microsoft word 6/7/8 document
175
176	sub convertWord678 {
177	($input_filename, $output_filestem, $output_type) = @_;
178
179	my $success = 0;
180
181	# Attempt specialised conversion to HTML
182	if (!$output_type \|\| ($output_type =~ /html/i)) {
183	$success = &doc_to_html($input_filename, $output_filestem);
184	if ($success) {
185	return "html";
186	}
187	}
188
189	return &convertAnything($input_filename, $output_filestem, $output_type);
190	}
191
192
193	# Convert a Rich Text Format (RTF) file
194
195	sub convertRTF {
196	($input_filename, $output_filestem, $output_type) = @_;
197
198	my $success = 0;
199
200	# Attempt specialised conversion to HTML
201	if (!$output_type \|\| ($output_type =~ /html/i)) {
202	$success = &rtf_to_html($input_filename, $output_filestem);
203	if ($success) {
204	return "html";
205	}
206	}
207
208	# rtf is so ugly that's it's not worth running strings over.
209	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
210	# return &convertAnything($input_filename, $output_filestem, $output_type);
211	return "fail";
212	}
213
214
215	# Convert an unidentified file
216
217	sub convertAnything {
218	($input_filename, $output_filestem, $output_type) = @_;
219
220	my $success = 0;
221
222	# Attempt simple conversion to HTML
223	if (!$output_type \|\| ($output_type =~ /html/i)) {
224	$success = &any_to_html($input_filename, $output_filestem);
225	if ($success) {
226	return "html";
227	}
228	}
229
230	# Convert to text
231	if (!$output_type \|\| ($output_type =~ /text/i)) {
232	$success = &any_to_text($input_filename, $output_filestem);
233	if ($success) {
234	return "text";
235	}
236	}
237	return "fail";
238	}
239
240
241
242	# Convert an Adobe PDF document
243
244	sub convertPDF {
245	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
246
247	my $success = 0;
248
249	# Attempt conversion to HTML
250	if (!$output_type \|\| ($output_type =~ /html/i)) {
251	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
252	if ($success) {
253	return "html";
254	}
255	}
256
257	# Attempt conversion to TEXT
258	if (!$output_type \|\| ($output_type =~ /text/i)) {
259	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
260	if ($success) {
261	return "text";
262	}
263	}
264
265	return "fail";
266
267	}
268
269
270	# Convert an Adobe PostScript document
271
272	sub convertPS {
273	($input_filename, $output_filestem, $output_type) = @_;
274
275	my $success = 0;
276
277	# Attempt conversion to TEXT
278	if (!$output_type \|\| ($output_type =~ /text/i)) {
279	$success = &ps_to_text($input_filename, $output_filestem);
280	if ($success) {
281	return "text";
282	}
283	}
284
285	return "fail";
286
287	}
288
289
290	# Find the real type of a .doc file
291	#
292	# We seem to have a lot of files with a .doc extension that are .rtf
293	# files or Word 5 files. This function attempts to tell the difference.
294
295	sub find_docfile_type {
296	($input_filename) = @_;
297
298	open(CHK, "<$input_filename");
299	binmode(CHK);
300	my $line = "";
301	my $first = 1;
302
303	while (<CHK>) {
304
305	$line = $_;
306
307	if ($first) {
308	# check to see if this is an rtf file
309	if ($line =~ /^\{\\rtf/) {
310	close(CHK);
311	return "rtf";
312	}
313	$first = 0;
314	}
315
316	# is this is a word 6/7/8 document?
317	if ($line =~ /Word\.Document\.([678])/) {
318	close(CHK);
319	return "word$1";
320	}
321
322	}
323
324	return "unknown";
325	}
326
327
328
329	# Specific type-to-type conversions
330	#
331	# Each of the following functions attempts to convert a document from
332	# a specific format to another. If they succeed they return 1 and leave
333	# the output document(s) in the appropriate place; if they fail they
334	# return 0 and delete any working files.
335
336
337	# Attempt to convert a word document to html with the wv program
338
339	sub doc_to_html {
340	($input_filename, $output_filestem) = @_;
341
342	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
343	$ENV{'GSDLOS'}, "wvWare");
344
345	# don't include path on windows (to avoid having to play about
346	# with quoting when GSDLHOME might contain spaces) but assume
347	# that the PATH is set up correctly
348	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
349
350	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
351	"packages", "wv", "wvHtml.xml");
352
353	my $cmd = "";
354	if ($timeout) {$cmd = "ulimit -t $timeout;";}
355	$cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
356	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
357
358	# redirecting STDERR is a bad idea on windows 95/98
359	$cmd .= " 2> \"$output_filestem.err\""
360	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
361
362	# execute the command
363	$!=0;
364	if (system($cmd)!=0)
365	{
366	print STDERR "Error executing wv converter:$!\n";
367	if (-s "$output_filestem.err") {
368	open (ERRFILE, "<$output_filestem.err");
369
370	my $write_to_fail_log=0;
371	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
372	{$write_to_fail_log=1;}
373
374	my $line;
375	while ($line=<ERRFILE>) {
376	if ($line =~ /\w/) {
377	print STDERR "$line";
378	print FAILLOG "$line" if ($write_to_fail_log);
379	}
380	if ($line !~ m/startup error/) {next;}
381	print STDERR " (given an invalid .DOC file?)\n";
382	print FAILLOG " (given an invalid .DOC file?)\n"
383	if ($write_to_fail_log);
384
385	} # while ERRFILE
386	close FAILLOG if ($write_to_fail_log);
387	}
388	print STDERR "Continuing...\n";
389	return 0; # we can try any_to_text
390	}
391
392	# Was the conversion successful?
393
394	if (-s "$output_filestem.html") {
395	open(TMP, "$output_filestem.html");
396	$line = <TMP>;
397	close(TMP);
398	if ($line && $line =~ /DOCTYPE HTML/) {
399	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
400	return 1;
401	}
402	}
403
404	# If here, an error of some sort occurred
405	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
406	if (-e "$output_filestem.err") {
407	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
408	open (ERRLOG,"$output_filestem.err");
409	while (<ERRLOG>) {print FAILLOG $_;}
410	close FAILLOG;
411	close ERRLOG;
412	}
413	&util::rm("$output_filestem.err");
414	}
415
416	return 0;
417	}
418
419
420	# Attempt to convert an RTF document to html with rtftohtml
421
422	sub rtf_to_html {
423	my ($input_filename, $output_filestem) = @_;
424
425	# formulate the command
426	$cmd = "";
427	if ($timeout) {$cmd = "ulimit -t $timeout;";}
428	$cmd .= "rtftohtml";
429
430	# it automatically uses $output_filestem.html
431	$cmd .= " \"$input_filename\"";
432
433	$cmd .= " 2>\"$output_filestem.err\""
434	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
435
436
437	# execute the command
438	$!=0;
439	if (system($cmd)!=0)
440	{
441	print STDERR "Error executing rtf converter $!\n";
442	# don't currently bother printing out error log...
443	# keep going, in case it still created an HTML file...
444	}
445
446	# Was the conversion successful?
447	my $was_successful=0;
448	if (-s "$output_filestem.html") {
449	# make sure we have some content other than header
450	open (HTML, "$output_filestem.html"); # what to do if fail?
451	my $line;
452	my $past_header=0;
453	while ($line=<HTML>) {
454
455	if ($past_header == 0) {
456	if ($line =~ /<body>/) {$past_header=1;}
457	next;
458	}
459
460	$line =~ s/<[^>]+>//g;
461	if ($line =~ /\w/ && $past_header) { # we found some content...
462	$was_successful=1;
463	last;
464	}
465	}
466	close HTML;
467	}
468
469	if ($was_successful) {
470	&util::rm("$output_filestem.err")
471	if (-e "$output_filestem.err");
472	# insert the (modified) table of contents, if it exists.
473	if (-e "${output_filestem}_ToC.html") {
474	&util::mv("$output_filestem.html","$output_filestem.src");
475	my $open_failed=0;
476	open HTMLSRC, "$output_filestem.src" \|\| ++$open_failed;
477	open TOC, "${output_filestem}_ToC.html" \|\| ++$open_failed;
478	open HTML, ">$output_filestem.html" \|\| ++$open_failed;
479
480	if ($open_failed) {
481	close HTMLSRC;
482	close TOC;
483	close HTML;
484	&util::mv("$output_filestem.src","$output_filestem.html");
485	return 1;
486	}
487
488	# print out header info from src html.
489	while (($_ = <HTMLSRC>) =~ /\w/) {
490	print HTML "$_";
491	}
492
493	# print out table of contents, making links relative
494	<TOC>; <TOC>; # ignore first 2 lines
495	print HTML scalar(<TOC>); # line 3 = "<ol>\n"
496	my $line;
497	while ($line=<TOC>) {
498	$line =~ s@</body></html>$@@ ; # only last line has this
499	# make link relative
500	$line =~ s@href=\"[^\#]+@href=\"@;
501	print HTML $line;
502	}
503	close TOC;
504
505	# rest of html src
506	while (<HTMLSRC>) {
507	print HTML $_;
508	}
509	close HTMLSRC;
510	close HTML;
511
512	&util::rm("${output_filestem}_ToC.html");
513	&util::rm("${output_filestem}.src");
514	}
515	# we don't yet do anything with footnotes ($output_filestem_fn.html) :(
516	return 1; # success
517	}
518
519	if (-e "$output_filestem.err") {
520	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
521	{
522	print FAILLOG "Error - rtftohtml - couldn't extract text\n";
523	print FAILLOG " (rtf file might be too recent):\n";
524	open (ERRLOG, "$output_filestem.err");
525	while (<ERRLOG>) {print FAILLOG $_;}
526	close ERRLOG;
527	close FAILLOG;
528	}
529	&util::rm("$output_filestem.err");
530	}
531
532	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
533
534	return 0;
535	}
536
537
538	# Convert a pdf file to html with the pdftohtml command
539
540	sub pdf_to_html {
541	my ($dirname, $input_filename, $output_filestem) = @_;
542
543	$cmd = "";
544	if ($timeout) {$cmd = "ulimit -t $timeout;";}
545	$cmd .= "perl -S pdftohtml.pl ";
546	$cmd .= " \"$input_filename\" \"$output_filestem\"";
547
548	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000) {
549	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
550	} else {
551	$cmd .= " > \"$output_filestem.err\"";
552	}
553
554	$!=0;
555
556	my $retval=system($cmd);
557	if ($retval!=0)
558	{
559	print STDERR "Error executing pdftohtml.pl";
560	if ($!) {print STDERR ": $!";}
561	print STDERR "\n";
562	}
563
564	# make sure the converter made something
565	if ($retval!=0 \|\| ! -s "$output_filestem.html")
566	{
567	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
568	# print out the converter's std err, if any
569	if (-s "$output_filestem.err") {
570	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
571	print STDERR "pdftohtml error log:\n";
572	while (<ERRLOG>) {
573	print STDERR "$_";
574	}
575	close ERRLOG;
576	}
577	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
578	if (-e "$output_filestem.err") {
579	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
580	{
581	open (ERRLOG, "$output_filestem.err");
582	while (<ERRLOG>) {print FAILLOG $_;}
583	close ERRLOG;
584	close FAILLOG;
585	}
586	&util::rm("$output_filestem.err");
587	}
588	return 0;
589	}
590
591	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
592	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
593	return 1;
594	}
595
596	# Convert a PDF file to text with the pdftotext command
597
598	sub pdf_to_text {
599	my ($dirname, $input_filename, $output_filestem) = @_;
600
601	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
602
603	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
604	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
605	} else {
606	$cmd .= " > \"$output_filestem.err\"";
607	}
608
609	if (system($cmd)!=0)
610	{
611	print STDERR "Error executing $cmd: $!\n";
612	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
613	}
614
615	# make sure there is some extracted text.
616	if (-e "$output_filestem.text") {
617	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
618	binmode(EXTR_TEXT); # just in case...
619	my $line="";
620	my $seen_text=0;
621	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
622	if ($line=~ /\w/) {$seen_text=1;}
623	}
624	close EXTR_TEXT;
625	if ($seen_text==0) { # no text was extracted
626	print STDERR "Error: pdftotext found no text\n";
627	&util::rm("$output_filestem.text");
628	}
629	}
630
631	# make sure the converter made something
632	if (! -s "$output_filestem.text")
633	{
634	# print out the converters std err, if any
635	if (-s "$output_filestem.err") {
636	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
637	print STDERR "pdftotext error log:\n";
638	while (<ERRLOG>) {
639	print STDERR "$_";
640	}
641	close ERRLOG;
642	}
643	# does this converter create a .out file?
644	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
645	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
646	if (-e "$output_filestem.err") {
647	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
648	{
649	open (ERRLOG,"$output_filestem.err");
650	while (<ERRLOG>) {print FAILLOG $_;}
651	close ERRLOG;
652	close FAILLOG;
653	}
654	&util::rm("$output_filestem.err");
655	}
656	return 0;
657	}
658	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
659	return 1;
660	}
661
662	# Convert a PostScript document to text
663	# note - just using "ps2ascii" isn't good enough, as it
664	# returns 0 for a postscript interpreter error. ps2ascii is just
665	# a wrapper to "gs" anyway, so we use that cmd here.
666
667	sub ps_to_text {
668	my ($input_filename, $output_filestem) = @_;
669
670	my $error = "";
671
672	# if we're on windows we'll fall straight through without attempting
673	# to use gs
674	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
675	$error = "Windows does not support gs";
676
677	} else {
678	my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
679	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
680	$cmd .= " 2> $output_filestem.err";
681	$!=0;
682
683	my $retcode=system($cmd);
684	$retcode = $? >> 8; # see man perlfunc - system for this...
685	# if system returns -1 \| 127 (couldn't start program), look at $! for message
686
687	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
688	elsif (! -e "$output_filestem.text") {
689	$error="did not create output file.\n";
690	}
691	else
692	{ # make sure the interpreter didn't get an error. It is technically
693	# possible for the actual text to start with this, but....
694	open PSOUT, "$output_filestem.text";
695	if (<PSOUT> =~ /^Error: (.*)/) {
696	$error="interpreter error - \"$1\"";
697	}
698	close PSOUT;
699	}
700	}
701
702	if ($error ne "")
703	{
704	print STDERR "Warning: Error executing gs: $error\n";
705	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
706
707	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
708	{
709	print FAILLOG "gs - $error\n";
710	if (-e "$output_filestem.err") {
711	open(ERRLOG, "$output_filestem.err");
712	while (<ERRLOG>) {print FAILLOG $_;}
713	close ERRLOG;
714	}
715	close FAILLOG;
716	}
717	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
718
719
720	# Fine then. We'll just do a lousy job by ourselves...
721	# Based on 5-line regexp sed script found at:
722	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
723	#
724	print STDERR "Stripping text from postscript\n";
725	my $errorcode=0;
726	open (IN, "$input_filename")
727	\|\| ($errorcode=1, warn "Couldn't read file: $!");
728	open (OUT, ">$output_filestem.text")
729	\|\| ($errorcode=1, warn "Couldn't write file: $!");
730	if ($errorcode) {print STDERR "errors\n";return 0;}
731
732	my $text=""; # this is for whole .ps file...
733	$text = join('', <IN>); # see man perlport, under "System Resources"
734	close IN;
735
736	# Make sure this is a ps file...
737	if ($text !~ /^%!/) {
738	print STDERR "Bad postscript header: not '%!'\n";
739	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
740	{
741	print FAILLOG "Bad postscript header: not '%!'\n";
742	close FAILLOG;
743	}
744	return 0;
745	}
746
747	# if ps has Page data, then use it to delete all stuff before it.
748	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
749
750	# remove all leading non-data stuff
751	$text =~ s/^.*?\(//s;
752
753	# remove all newline chars for easier processing
754	$text =~ s/\n//g;
755
756	# Big assumption here - assume that if any co-ordinates are
757	# given, then we are at the end of a sentence.
758	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
759
760	# special characters--
761	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
762
763	# ? ps text formatting (eg italics?) ?
764	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
765	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
766	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
767	# default - remove the rest
768	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
769
770	# attempt to add whitespace between words...
771	# this is based purely on observation, and may be completely wrong...
772	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
773	# eg I notice "b(" is sometimes NOT a space if preceded by a
774	# negative number.
775	$text =~ s/\)\d+ ?b\(/\) \( /g;
776
777	# change quoted braces to brackets
778	$text =~ s/([^\\])\\\(/$1\{/g;
779	$text =~ s/([^\\])\\\)/$1\}/g ;
780
781	# remove everything that is not between braces
782	$text =~ s/\)([^\(\)])+?\(//sg ;
783
784	# remove any Trailer eof stuff.
785	$text =~ s/\)[^\)]*$//sg;
786
787	### ligatures have special characters...
788	$text =~ s/\\013/ff/g;
789	$text =~ s/\\014/fi/g;
790	$text =~ s/\\015/fl/g;
791	$text =~ s/\\016/ffi/g;
792	$text =~ s/\\214/fi/g;
793	$text =~ s/\\215/fl/g;
794	$text =~ s/\\017/\n\* /g; # asterisk?
795	$text =~ s/\\023/\023/g; # e acute ('e)
796	$text =~ s/\\177/\252/g; # u"
797	# $text =~ s/ ?? /\344/g; # a"
798
799	print OUT "$text";
800	close OUT;
801	}
802	# wrap the text - use a minimum length. ie, first space after this length.
803	my $wrap_length=72;
804	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
805	open INFILE, "$output_filestem.text.tmp" \|\|
806	die "Couldn't open file: $!";
807	open OUTFILE, ">$output_filestem.text" \|\|
808	die "Couldn't open file for writing: $!";
809	my $line="";
810	while ($line=<INFILE>) {
811	while (length($line)>0) {
812	if (length($line)>$wrap_length) {
813	$line =~ s/^(.{$wrap_length}[^\s])\s//;
814	print OUTFILE "$1\n";
815	} else {
816	print OUTFILE "$line";
817	$line="";
818	}
819	}
820	}
821	close INFILE;
822	close OUTFILE;
823	&util::rm("$output_filestem.text.tmp");
824
825	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
826	return 1;
827	}
828
829
830	# Convert any file to HTML with a crude perl implementation of the
831	# UNIX strings command.
832
833	sub any_to_html {
834	($input_filename, $output_filestem) = @_;
835
836	# First generate a text file
837	return 0 unless (&any_to_text($input_filename, $output_filestem));
838
839	# create an HTML file from the text file
840	open(TEXT, "<$output_filestem.text");
841	open(HTML, ">$output_filestem.html");
842
843	print HTML "<html><head>\n";
844	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
845	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
846	print HTML "</head><body>\n\n";
847
848	my $line;
849	while ($line=<TEXT>) {
850	$line =~ s/</</g;
851	$line =~ s/>/>/g;
852	if ($line =~ /^\s*$/) {
853	print HTML "<p>";
854	} else {
855	print HTML "<br> ", $line;
856	}
857	}
858	print HTML "\n</body></html>\n";
859
860	close HTML;
861	close TEXT;
862
863	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
864	return 1;
865	}
866
867	# Convert any file to TEXT with a crude perl implementation of the
868	# UNIX strings command.
869	# Note - this assumes ascii charsets :( (jrm21)
870
871	sub any_to_text {
872	($input_filename, $output_filestem) = @_;
873
874	open(IN, "<$input_filename") \|\| return 0;
875	binmode(IN);
876	open(OUT, ">$output_filestem.text") \|\| return 0;
877
878	my ($line);
879	my $output_line_count = 0;
880	while (<IN>) {
881	$line = $_;
882
883	# delete anything that isn't a printable character
884	$line =~ s/[^\040-\176]+/\n/sg;
885
886	# delete any string less than 10 characters long
887	$line =~ s/^.{0,9}$/\n/mg;
888	while ($line =~ /^.{1,9}$/m) {
889	$line =~ s/^.{0,9}$/\n/mg;
890	$line =~ s/\n+/\n/sg;
891	}
892
893	# remove extraneous whitespace
894	$line =~ s/\n+/\n/gs;
895	$line =~ s/^\n//gs;
896
897	# output whatever is left
898	if ($line =~ /[^\n ]/) {
899	print OUT $line;
900	++$output_line_count;
901	}
902	}
903
904	close OUT;
905	close IN;
906
907	if ($output_line_count) { # try to protect against binary only formats
908	return 1;
909	}
910
911	&util::rm("$output_filestem.text");
912	return 0;
913
914	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: