Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 10282

Last change on this file since 10282 was 10282, checked in by chi, 19 years ago
Modifications to allow gsConvert to run either an open-source converting program or VB scripting for certain types of document (e.g. Word, PPT, etc.)
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 31.3 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Put the Greenstone perl library on the module search path before the
# project modules are loaded; refuse to start if GSDLHOME is not set.
BEGIN {
    my $gsdlhome = $ENV{'GSDLHOME'};
    if (!defined $gsdlhome) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$gsdlhome/perllib");
}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# If Win32 cannot be loaded the eval yields undef, which is treated as "no".
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches, filled in by parsargv::parse() in main().
my $use_strings;        # -use_strings
my $pdf_complex;        # -pdf_complex
my $pdf_nohidden;       # -pdf_nohidden
my $pdf_zoom;           # -pdf_zoom
my $pdf_ignore_images;  # -pdf_ignore_images
my $windows_scripting;  # -windows_scripting
66
# Write the usage summary to STDERR and terminate with exit status 1.
sub print_usage
{
    my @usage = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage) {
        print STDERR $usage_line;
    }
    exit(1);
}
87
# Fail-log filename (-errlog; the empty string disables logging) and the
# CPU-seconds limit (-timeout; 0 disables the limit).
my ($faillogfile, $timeout) = ("", 0);
90
# Entry point: parse the command line, work out the input type, change to
# the document's directory and hand over to the matching converter.
#
# Arguments: the command-line argument list.
# Prints whatever the converter returns (e.g. "html", "text", "fail")
# followed by a newline on STDOUT.  Exits with status 1 on a usage error,
# an unreadable input file, or an unknown/undeterminable input type.
sub main
{
    # Work on a private copy under its own name instead of shadowing the
    # global @ARGV with a lexical of the same name.
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    # NOTE(review): the errlog spec has a leading '/' unlike every other
    # spec; left untouched because parsargv's handling is not visible here.
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting', \$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension.  $suffix
    # includes its leading dot and is empty when there is no extension.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    # NOTE(review): $input_filename is passed on unchanged after the chdir,
    # so callers presumably supply absolute paths -- confirm.
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Converters that share the (input, output stem, output type) signature
    my %converter_for = (
        'doc' => \&convertDOC,
        'dot' => \&convertDOC,
        'rtf' => \&convertRTF,
        'ps'  => \&convertPS,
        'ppt' => \&convertPPT,
        'xls' => \&convertXLS,
    );

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "pdf") {
        # the PDF converter additionally takes the directory name
        $result = convertPDF($dirname, $input_filename, $output_filestem,
                             $output_type);
    }
    elsif (exists $converter_for{$input_type}) {
        $result = $converter_for{$input_type}->($input_filename,
                                                $output_filestem,
                                                $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

main(@ARGV);
179
180
181
182	# Document-type conversion functions
183	#
184	# The following functions attempt to convert documents from their
185	# input type to the specified output type. If no output type was
186	# given, then they first attempt HTML, and then TEXT.
187	#
188	# Each returns the output type ("html" or "text") or "fail" if no
189	# conversion is possible.
190
# Convert a Microsoft word document
#
# Arguments: input filename, output file stem (no extension), requested
# output type (may be empty).
# Sniffs the real content type first and returns whatever the chosen
# converter returns.
sub convertDOC {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    return convertAnything($input_filename, $output_filestem, $output_type);
}
207
# Convert a Microsoft word 6/7/8 document
#
# Arguments: input filename, output file stem, requested output type.
# When HTML is acceptable, tries the VB script converter (if
# -windows_scripting was given) or wvWare otherwise, and returns "html" on
# success.  In every other case the result of convertAnything is returned.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        my $success;
        if ($windows_scripting) {
            print STDERR "***** Calling VB Script!\n";
            $success = native_doc_to_html($input_filename, $output_filestem);
        }
        else {
            print STDERR "**** Calling wvWare\n";
            $success = doc_to_html($input_filename, $output_filestem);
        }
        return "html" if ($success);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
238
239
# Convert a Rich Text Format (RTF) file
#
# Arguments: input filename, output file stem, requested output type.
# Returns "html" when rtf_to_html succeeds, otherwise "fail" (there is no
# text fallback for RTF).
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (rtf_to_html($input_filename, $output_filestem));
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
260
261
# Convert an unidentified file
#
# Arguments: input filename, output file stem, requested output type.
# Tries any_to_html and then any_to_text, each only when the requested
# output type is empty or matches.  Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
286
287
288
# Convert an Adobe PDF document
#
# HTML is attempted first, then plain text; each attempt is made only when
# no output type was requested or the requested type names that format.
# Returns "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ /html/i));
    my $text_wanted = (!$output_type || ($output_type =~ /text/i));

    if ($html_wanted
        && &pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    if ($text_wanted
        && &pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
315
316
# Convert an Adobe PostScript document
#
# Arguments: input filename, output file stem, requested output type.
# Only text output is supported: returns "text" when ps_to_text succeeds
# and "fail" otherwise (including when a non-text type was requested).
sub convertPS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
333
334
# Convert a Microsoft PowerPoint document
#
# Arguments: input filename, output file stem, requested output type.
# The pptextract program is used when no output type was given, when
# -windows_scripting is set, or when the requested type is not HTML;
# otherwise ppttohtml.pl is run.
# Returns "item" when pptextract succeeds or its output directory already
# exists, "html" when ppttohtml.pl succeeds, and otherwise "text" or
# "fail" depending on the any_to_text fallback.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # avoid matching against undef below
    my $requested = defined($output_type) ? $output_type : "";
    my $redirect_stderr
        = ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    if (!$requested || $windows_scripting || ($requested !~ /html/i)) {
        # image format switch for pptextract (none means its default)
        my $ppt_convert_type = "";
        if ($requested =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($requested =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($requested =~ /png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # NOTE(review): the filenames are passed unquoted (a quoted variant
        # was commented out in the original), so paths containing spaces
        # will presumably break -- confirm what pptextract accepts.
        my $cmd = "$vbScript $ppt_convert_type $input_filename $output_filestem";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } else {
        # Attempt conversion to HTML
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: plain string extraction
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
397
398
# Convert a Microsoft Excel document
#
# Arguments: input filename, output file stem, requested output type.
# Runs xlstohtml.pl when HTML is acceptable and returns "html" on success;
# otherwise falls back to any_to_text and returns "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected where the shell supports it
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
431
432
433
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the filename to inspect.
# Returns "rtf" when the first line starts with "{\rtf", "word6", "word7"
# or "word8" when a "Word.Document.N" marker is found on any line, and
# "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit so the handle is closed on every path
    close($chk);
    return $type;
}
469
470
471	# Specific type-to-type conversions
472	#
473	# Each of the following functions attempts to convert a document from
474	# a specific format to another. If they succeed they return 1 and leave
475	# the output document(s) in the appropriate place; if they fail they
476	# return 0 and delete any working files.
477
478
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output file stem.
# Runs wvWare with its output redirected to "<stem>.html".  Returns 1 when
# the command succeeds and the first line of the HTML file contains
# "DOCTYPE HTML"; returns 0 otherwise.  Converter error output is echoed
# to STDERR and/or appended to the fail log when one is configured.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                    $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                     "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($errfile);
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    return 0;
}
558
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: input filename, output file stem.
# Runs word2html to produce "<stem>.html".  Returns 1 when the command
# succeeds and the first line of the HTML file contains "html"; returns 0
# otherwise.  Converter error output is echoed to STDERR and/or appended
# to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                      $ENV{'GSDLOS'}, "word2html");

    # rely on the PATH on windows rather than a full path
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # NOTE(review): the filenames are passed unquoted (a quoted variant was
    # commented out in the original), so paths containing spaces will
    # presumably break -- confirm what word2html accepts.
    $cmd .= "$vbScript $input_filename $output_filestem.html";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }
            close($errfile);
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /html/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }
    return 0;
}
630
631
632
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments: input filename, output file stem.
# Returns 1 when "<stem>.html" exists and contains at least one word
# character (outside tags) after the line holding "<body>"; in that case a
# "<stem>_ToC.html" file, if present, is merged into the output.
# Returns 0 otherwise, after logging and removing the working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html>) {
            if (!$past_header) {
                # the <body> line itself is never inspected for content
                $past_header = 1 if ($line =~ /<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html);
    }

    if ($was_successful) {
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            util::mv("$output_filestem.html", "$output_filestem.src");

            # Checked opens: the previous "open FH, $name || ..." form
            # applied || to the filename, so failures went unnoticed.
            my ($htmlsrc, $toc, $html);
            my $opened_all
                = (open($htmlsrc, '<', "$output_filestem.src")
                   && open($toc, '<', "${output_filestem}_ToC.html")
                   && open($html, '>', "$output_filestem.html"));

            if (!$opened_all) {
                foreach my $fh ($htmlsrc, $toc, $html) {
                    close($fh) if (defined $fh && defined fileno($fh));
                }
                # fall back to the unmerged converter output
                util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything before the
            # first line without a word character; that line is dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ /\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $third_line = <$toc>; # line 3 = "<ol>\n"
            print {$html} $third_line if (defined $third_line);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close($htmlsrc);
            close($html);

            util::rm("${output_filestem}_ToC.html");
            util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
750
751
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory name (unused here), input filename, output file
# stem.  Runs pdftohtml.pl with the -pdf_* options from the command line.
# Returns 1 when the command exits with status 0 and "<stem>.html" is
# non-empty; otherwise echoes/logs the converter's error output, removes
# the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no stderr redirection here; capture stdout in the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }

        util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
812
# Convert a PDF file to text with the pdftotext command
#
# Produces "<stem>.text".  The result counts as a success (return 1) only
# if that file ends up non-empty and contains at least one word character;
# otherwise the error output is reported, working files are removed and 0
# is returned.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= ($ENV{'GSDLOS'} !~ /^windows$/i)
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    unless (system($cmd) == 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open (EXTR_TEXT, "$text_file") || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...
        my $found_text = 0;
        while (!$found_text) {
            my $chunk = <EXTR_TEXT>;
            last unless ($chunk);
            $found_text = 1 if ($chunk =~ /\w/);
        }
        close EXTR_TEXT;
        unless ($found_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something: tidy up and report success
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open (ERRLOG, "$err_file") || die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $err_line = <ERRLOG>) {
            print STDERR "$err_line";
        }
        close ERRLOG;
    }

    # does this converter create a .out file?
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, "$err_file");
            while (my $err_line = <ERRLOG>) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
878
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments: input filename, output file stem.
# Tries gs first (never on windows); if that fails, falls back to a crude
# regular-expression text extraction.  The resulting "<stem>.text" is then
# re-wrapped at roughly 72 columns.
# Returns 1 on success, 0 when the fallback cannot read/write its files or
# the input does not start with "%!".
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;
        # Test the whole wait status, not just the exit code in the high
        # byte, so that death by signal also counts as a failure.
        # if system returns -1 (couldn't start program), look at $! for message
        my $status = system($cmd);

        if ($status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e "$output_filestem.text") {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', "$output_filestem.text")) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            if (defined $first_line && $first_line =~ /^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close($psout);
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e "$output_filestem.err"
                && open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in>); # this is for whole .ps file...
        close($in);

        # Make sure this is a ps file...  (checked before the output file
        # is created so a bad input leaves no empty .text file behind)
        if ($text !~ /^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', "$output_filestem.text")) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it
        # (everything up to and including the first "%%Page:" line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"

        print {$out} "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    util::mv("$output_filestem.text", "$output_filestem.text.tmp");
    # "or" (not "||") so that the die really applies to the open
    open(my $infile, '<', "$output_filestem.text.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', "$output_filestem.text")
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Take at least $wrap_length characters, extend to the end of
            # the current word and swallow the whitespace that follows.
            # If that ever fails to match, emit the remainder unchanged
            # rather than looping forever.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    util::rm("$output_filestem.text.tmp");

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}
1046
1047
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments: input filename, output file stem.
# Builds "<stem>.text" via any_to_text, wraps its lines in a minimal HTML
# page written to "<stem>.html", then removes the text file.
# Returns 1 on success, 0 if no text could be extracted or a file could
# not be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', "$output_filestem.text")) {
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }
    if (!open($html, '>', "$output_filestem.html")) {
        close($text);
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print {$html} "<html><head>\n";
    print {$html} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html} "</head><body>\n\n";

    while (my $line = <$text>) {
        # escape markup characters; '&' first so the entities produced
        # for '<' and '>' are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print {$html} "<p>";
        } else {
            print {$html} "<br> ", $line;
        }
    }
    print {$html} "\n</body></html>\n";

    close($html);
    close($text);

    util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1084
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments: input filename, output file stem.
# Does nothing and returns 0 unless -use_strings was given.  Otherwise
# writes the printable runs of at least 10 characters found in the input
# to "<stem>.text".  Returns 1 if anything was written; otherwise removes
# the output file and returns 0.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 if (!$use_strings);

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in); # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {
        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    util::rm("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: