Context Navigation

source: gsdl/trunk/bin/script/gsConvert.pl@ 14959

Last change on this file since 14959 was 12704, checked in by davidb, 18 years ago
convert RTF upgraded so it can also use windows scripting option.
Property svn:executable set to `*`. Property svn:keywords set to `Author Date Id Revision`.
File size: 36.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time, before the "use" lines below are resolved:
# refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the
# front of the module search path.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag is forced to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to
# parsargv::parse().  Meanings as given in the usage text.
my ($use_strings,            # -use_strings
    $pdf_complex,            # -pdf_complex
    $pdf_nohidden,           # -pdf_nohidden
    $pdf_zoom,               # -pdf_zoom
    $pdf_ignore_images,      # -pdf_ignore_images
    $pdf_allow_images_only,  # -pdf_allow_images_only
    $windows_scripting);     # -windows_scripting
67
# Write the command-line synopsis to STDERR and terminate the program
# with exit status 1.  Never returns.
sub print_usage
{
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    # one print per fragment, exactly as many writes as before
    foreach my $fragment (@usage_text) {
        print STDERR $fragment;
    }
    exit(1);
}
89
# Shared with the conversion routines below:
#   $faillogfile - file named by -errlog; converter error output is
#                  appended to it when it is not the empty string
#   $timeout     - value of -timeout; when true, commands are prefixed
#                  with "ulimit -t $timeout;"
my $faillogfile = "";
my $timeout = 0;

# Program entry point.
#
# Parses the options, checks that exactly one readable input file was
# named, works out the input type (from -type, else from the filename
# extension), changes into the file's directory and dispatches to the
# matching convert* routine.  The routine's result string is printed on
# STDOUT followed by a newline.  Exits with status 1 on any usage or
# input error.
sub main
{
    # work on a private copy rather than shadowing the global @ARGV
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    my $parsed_ok = parsargv::parse(\@args,
        'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
        '/errlog/.*/', \$faillogfile,
        'output/(auto|html|text|pagedimg).*/', \$output_type,
        'timeout/\d+/0', \$timeout,
        'verbose/\d+/0', \$verbose,
        'use_strings', \$use_strings,
        'windows_scripting', \$windows_scripting,
        'pdf_complex', \$pdf_complex,
        'pdf_ignore_images', \$pdf_ignore_images,
        'pdf_allow_images_only', \$pdf_allow_images_only,
        'pdf_nohidden', \$pdf_nohidden,
        'pdf_zoom/\d+/2', \$pdf_zoom
    );
    if (!$parsed_ok) {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the lower-cased extension without
    # its leading dot.  An absent extension leaves the type empty so that
    # the "no extension" error below is actually reachable.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # report the outcome on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
182
183
184
185	# Document-type conversion functions
186	#
187	# The following functions attempt to convert documents from their
188	# input type to the specified output type. If no output type was
189	# given, then they first attempt HTML, and then TEXT.
190	#
191	# Each returns the output type ("html" or "text") or "fail" if no
192	# conversion is possible.
193
# Convert a file that claims to be a Microsoft Word document.
#
# Many .doc files are not in fact Word documents, so the real type is
# sniffed first with find_docfile_type() and the file is handed to the
# Word 6/7/8 converter, the RTF converter, or the generic converter.
#
# Args:    input filename, output file stem, requested output type
# Returns: whatever the chosen converter returns

sub convertDOC {
    # lexical copies, so the package globals of the same names are not clobbered
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = &find_docfile_type($input_filename);

    if ($realtype =~ /\Aword[678]\z/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
210
# Convert a Microsoft Word 6/7/8 document.
#
# When no output type was given, or the requested type mentions "html",
# an HTML conversion is tried first: native_doc_to_html() if
# -windows_scripting is set, doc_to_html() otherwise.  If that is skipped
# or fails, the file is passed to convertAnything().
#
# Args:    input filename, output file stem, requested output type
# Returns: "html" on a successful HTML conversion, otherwise the result
#          of convertAnything()

sub convertWord678 {
    # lexical copies, so the package globals of the same names are not clobbered
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
231
232
# Convert a Rich Text Format (RTF) file.
#
# Only an HTML conversion is attempted (native_doc_to_html() when
# -windows_scripting is set, rtf_to_html() otherwise), and only when no
# output type was given or the requested type mentions "html".
#
# Args:    input filename, output file stem, requested output type
# Returns: "html" on success, "fail" otherwise

sub convertRTF {
    # lexical copies, so the package globals of the same names are not clobbered
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
259
260
# Convert an unidentified file.
#
# Tries any_to_html() and then any_to_text(); each step is attempted only
# when no output type was given or the requested type mentions that
# format.
#
# Args:    input filename, output file stem, requested output type
# Returns: "html", "text", or "fail" when neither step succeeded

sub convertAnything {
    # lexical copies, so the package globals of the same names are not clobbered
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
285
286
287
# Convert an Adobe PDF document.
#
# Image output is tried when the requested type names jpg/jpeg-style,
# gif or png; then HTML, then text, each only when no type was given or
# the requested type mentions it.
#
# Args:    directory of the input file, input filename, output file
#          stem, requested output type
# Returns: "item" (images), "html", "text", or "fail"

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Collapses "<char>-<char>" to the second char, so "pagedimg-jpg"
    # becomes "pagedimjpg".
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ /jp?g|gif|png/i) {
        my $made_images = &pdf_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        my $made_html = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        my $made_text = &pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
322
323
# Convert an Adobe PostScript document.
#
# Image output is tried when the requested type names jpg, gif or png;
# text output is tried when no type was given or the requested type
# mentions "text".
#
# Args:    input filename, output file stem, requested output type
# Returns: "item" (images), "text", or "fail"

sub convertPS {
    # lexical copies, so the package globals of the same names are not clobbered
    my ($input_filename, $output_filestem, $output_type) = @_;

    # ps_to_img() expects the input file's directory as its first
    # argument.  main() does not pass it to this routine, so derive it
    # here the same way main() does instead of reading an unset global.
    my (undef, $dirname)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # Collapses "<char>-<char>" to the second char, so "pagedimg-jpg"
    # becomes "pagedimjpg".
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ /jp?g|gif|png/i) {
        my $success = &ps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($success);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        my $success = &ps_to_text($input_filename, $output_filestem);
        return "text" if ($success);
    }

    return "fail";
}
348
349
# Convert a Microsoft PowerPoint file.
#
# With -windows_scripting set and an output type that mentions neither
# "html" nor "text", the pptextract script is run to produce a directory
# named after the output file stem (-g / -j / -p selects gif / jpg / png).
# Otherwise, when no type was given or the type mentions "html",
# ppttohtml.pl is run.  If the chosen converter fails, any_to_text() is
# tried as a last resort.
#
# Args:    input filename, output file stem, requested output type
# Returns: "item", "html", "text", or "fail"
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # stderr is only redirected off Windows, or on WinNT/2000 and later
    my $redirect_stderr = ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)) {
        my $ppt_convert_type = "";
        if ($output_type =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ /png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # an existing output directory is taken as a finished conversion
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # $cmd is now lexical; it used to leak into a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } elsif (!$output_type || ($output_type =~ /html/i)) {
        # Attempt conversion to HTML
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # last resort: generic text extraction
    my $success = &any_to_text($input_filename, $output_filestem);
    return "text" if ($success);

    return "fail";
}
411
412
# Convert a Microsoft Excel file.
#
# xlstohtml.pl is run when no output type was given or the requested type
# mentions "html"; if that is skipped or fails, any_to_text() is tried.
#
# Args:    input filename, output file stem, requested output type
# Returns: "html", "text", or "fail"
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # $cmd is now lexical; it used to leak into a package global
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected off Windows, or on WinNT/2000 and later
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # fall back to generic text extraction
    my $success = &any_to_text($input_filename, $output_filestem);
    return "text" if ($success);

    return "fail";
}
445
446
447
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Args:    input filename
# Returns: "rtf" when the first line starts with "{\rtf",
#          "word6" / "word7" / "word8" when any line contains
#          "Word.Document.<n>", and "unknown" otherwise (including when
#          the file cannot be opened)
sub find_docfile_type {
    my ($input_filename) = @_;

    # three-argument open: the filename is never interpreted as a mode
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
483
484
485	# Specific type-to-type conversions
486	#
487	# Each of the following functions attempts to convert a document from
488	# a specific format to another. If they succeed they return 1 and leave
489	# the output document(s) in the appropriate place; if they fail they
490	# return 0 and delete any working files.
491
492
# Attempt to convert a word document to html with the wv program
#
# Runs wvWare with its output redirected to "$output_filestem.html" and,
# where supported, its stderr to "$output_filestem.err".  The result is
# accepted when the command succeeds and the first line of the HTML file
# contains "DOCTYPE HTML".  On failure the error output is echoed and/or
# appended to the failure log, and the working files are removed.
#
# Args:    input filename, output file stem
# Returns: 1 on success, 0 on failure
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s $err_file) {
            my $faillog;
            my $write_to_fail_log =
                ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) ? 1 : 0;

            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # this handle used to be left open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $html, '<', $html_file)) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    return 0;
}
571
572
# Attempt to convert a word document to html with the word2html scripting program
#
# An already existing "$output_filestem.html" is treated as a finished
# conversion.  Otherwise word2html is run and the result is accepted when
# the command succeeds and the first line of the HTML file contains
# "html".  On failure the error output is echoed and/or appended to the
# failure log, and the working files are removed.
#
# Args:    input filename, output file stem
# Returns: 1 on success, 0 on failure
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    if (-e $html_file) {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s $err_file) {
            my $faillog;
            my $write_to_fail_log =
                ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) ? 1 : 0;

            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # this handle used to be left open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $html, '<', $html_file)) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ /html/) {
            &util::rm($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
649
# Attempt to convert an RTF document to html with rtftohtml
#
# The conversion counts as successful when the generated HTML contains at
# least one word character after its <body> line (tags stripped).  If the
# converter also wrote "<stem>_ToC.html", that table of contents is
# spliced into the HTML after the header, with its links made relative.
# On failure the error output is appended to the failure log and the
# working files are removed.
#
# Args:    input filename, output file stem
# Returns: 1 on success, 0 on failure

sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ /<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # "or", not "||": the high-precedence form bound to the
            # filename string, so a failed open was never noticed.
            # The output file is only opened (and truncated) once both
            # inputs are readable.
            my ($htmlsrc, $toc, $html_out);
            my $open_failed = 0;
            open($htmlsrc, '<', $src_file) or ++$open_failed;
            open($toc, '<', $toc_file) or ++$open_failed;
            if (!$open_failed) {
                open($html_out, '>', $html_file) or ++$open_failed;
            }

            if ($open_failed) {
                foreach my $fh ($htmlsrc, $toc, $html_out) {
                    close($fh) if (defined $fh && defined fileno($fh));
                }
                # put the plain conversion back and settle for that
                &util::mv($src_file, $html_file) if (-e $src_file);
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $line = <$htmlsrc>)) {
                last if ($line !~ /\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>; # ignore first 2 lines
            $ignored = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
767
768
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with flags taken from the -pdf_* options.  The
# conversion succeeds when the command returns 0 and a non-empty
# "$output_filestem.html" exists.  On failure the converter's error
# output is echoed to STDERR and appended to the failure log, and the
# working files are removed.
#
# Args:    directory of the input file (not used here), input filename,
#          output file stem
# Returns: 1 on success, 0 on failure

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # $cmd is now lexical; it used to leak into a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no stderr redirection here; stdout goes to the .err file instead
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file) {
        &util::rm($out_file) if (-e $out_file);
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        &util::rm($html_file) if (-e $html_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
830
# Convert a pdf file to various types of image with the convert command
#
# After probing for ImageMagick, runs pdftoimg.pl with the requested
# image type.  The conversion succeeds when the command returns 0.  On
# failure the converter's error output is echoed to STDERR and appended
# to the failure log, and the working files are removed.
#
# Args:    directory of the input file (not used here), input filename,
#          output file stem, output type
# Returns: 1 on success, 0 on failure

sub pdf_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $probe_output = `identify 2>&1`;
        # Linux and Windows return different values for "program not found".
        # NOTE(review): a POSIX shell normally reports a missing command
        # as exit status 127 ($? == 32512), which this test would not
        # match -- confirm on the target platforms.
        if ($? == -1 || $? == 256) {
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    # $cmd is now lexical; it used to leak into a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.\_(.)/$1/i;
    $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no stderr redirection here; stdout goes to the .err file instead
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &util::rm($out_file) if (-e $out_file);
        #print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print {$faillog} $line;}
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
899
# Convert a PDF file to text with the pdftotext command
#
# The conversion succeeds when "$output_filestem.text" ends up non-empty
# and contains at least one line with a word character.  On failure the
# converter's error output is echoed to STDERR and appended to the
# failure log, and the working files are removed.
#
# Args:    directory of the input file (not used here), input filename,
#          output file stem
# Returns: 1 on success, 0 on failure

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= ($ENV{'GSDLOS'} !~ /^windows$/i)
        ? " > \"$out_file\" 2> \"$err_file\""
        : " > \"$err_file\"";

    unless (system($cmd) == 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $extracted;
        open($extracted, "$text_file") || warn "open: $!";
        binmode($extracted); # just in case...
        my $line = "";
        my $found_text = 0;
        while (!$found_text && ($line = <$extracted>)) {
            $found_text = 1 if ($line =~ /\w/);
        }
        close $extracted;
        if (!$found_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something: tidy up and report success
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        my $errlog;
        open($errlog, "$err_file") || die "$!";
        print STDERR "pdftotext error log:\n";
        while (<$errlog>) {
            print STDERR "$_";
        }
        close $errlog;
    }
    # does this converter create a .out file?
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);
    if (-e $err_file) {
        my $faillog;
        if ($faillogfile ne "" && defined(open($faillog, ">>$faillogfile"))) {
            my $errlog;
            open($errlog, "$err_file");
            while (<$errlog>) {print $faillog $_;}
            close $errlog;
            close $faillog;
        }
        &util::rm($err_file);
    }
    return 0;
}
965
966	# Convert a PostScript document to text
967	# note - just using "ps2ascii" isn't good enough, as it
968	# returns 0 for a postscript interpreter error. ps2ascii is just
969	# a wrapper to "gs" anyway, so we use that cmd here.
970
971	sub ps_to_text {
972	my ($input_filename, $output_filestem) = @_;
973
974	my $error = "";
975
976	# if we're on windows we'll fall straight through without attempting
977	# to use gs
978	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
979	$error = "Windows does not support gs";
980
981	} else {
982	my $cmd = "";
983	if ($timeout) {$cmd = "ulimit -t $timeout; ";}
984	$cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
985	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
986	#$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
987	$cmd .= " 2> $output_filestem.err";
988	$!=0;
989
990	my $retcode=system($cmd);
991	$retcode = $? >> 8; # see man perlfunc - system for this...
992	# if system returns -1 \| 127 (couldn't start program), look at $! for message
993
994	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
995	elsif (! -e "$output_filestem.text") {
996	$error="did not create output file.\n";
997	}
998	else
999	{ # make sure the interpreter didn't get an error. It is technically
1000	# possible for the actual text to start with this, but....
1001	open PSOUT, "$output_filestem.text";
1002	if (<PSOUT> =~ /^Error: (.*)/) {
1003	$error="interpreter error - \"$1\"";
1004	}
1005	close PSOUT;
1006	}
1007	}
1008
1009	if ($error ne "")
1010	{
1011	print STDERR "Warning: Error executing gs: $error\n";
1012	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1013
1014	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
1015	{
1016	print FAILLOG "gs - $error\n";
1017	if (-e "$output_filestem.err") {
1018	open(ERRLOG, "$output_filestem.err");
1019	while (<ERRLOG>) {print FAILLOG $_;}
1020	close ERRLOG;
1021	}
1022	close FAILLOG;
1023	}
1024	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1025
1026
1027	# Fine then. We'll just do a lousy job by ourselves...
1028	# Based on 5-line regexp sed script found at:
1029	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
1030	#
1031	print STDERR "Stripping text from postscript\n";
1032	my $errorcode=0;
1033	open (IN, "$input_filename")
1034	\|\| ($errorcode=1, warn "Couldn't read file: $!");
1035	open (OUT, ">$output_filestem.text")
1036	\|\| ($errorcode=1, warn "Couldn't write file: $!");
1037	if ($errorcode) {print STDERR "errors\n";return 0;}
1038
1039	my $text=""; # this is for whole .ps file...
1040	$text = join('', <IN>); # see man perlport, under "System Resources"
1041	close IN;
1042
1043	# Make sure this is a ps file...
1044	if ($text !~ /^%!/) {
1045	print STDERR "Bad postscript header: not '%!'\n";
1046	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
1047	{
1048	print FAILLOG "Bad postscript header: not '%!'\n";
1049	close FAILLOG;
1050	}
1051	return 0;
1052	}
1053
1054	# if ps has Page data, then use it to delete all stuff before it.
1055	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
1056
1057	# remove all leading non-data stuff
1058	$text =~ s/^.*?\(//s;
1059
1060	# remove all newline chars for easier processing
1061	$text =~ s/\n//g;
1062
1063	# Big assumption here - assume that if any co-ordinates are
1064	# given, then we are at the end of a sentence.
1065	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
1066
1067	# special characters--
1068	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
1069
1070	# ? ps text formatting (eg italics?) ?
1071	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
1072	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
1073	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
1074	# default - remove the rest
1075	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
1076
1077	# attempt to add whitespace between words...
1078	# this is based purely on observation, and may be completely wrong...
1079	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
1080	# eg I notice "b(" is sometimes NOT a space if preceded by a
1081	# negative number.
1082	$text =~ s/\)\d+ ?b\(/\) \( /g;
1083
1084	# change quoted braces to brackets
1085	$text =~ s/([^\\])\\\(/$1\{/g;
1086	$text =~ s/([^\\])\\\)/$1\}/g ;
1087
1088	# remove everything that is not between braces
1089	$text =~ s/\)([^\(\)])+?\(//sg ;
1090
1091	# remove any Trailer eof stuff.
1092	$text =~ s/\)[^\)]*$//sg;
1093
1094	### ligatures have special characters...
1095	$text =~ s/\\013/ff/g;
1096	$text =~ s/\\014/fi/g;
1097	$text =~ s/\\015/fl/g;
1098	$text =~ s/\\016/ffi/g;
1099	$text =~ s/\\214/fi/g;
1100	$text =~ s/\\215/fl/g;
1101	$text =~ s/\\017/\n\* /g; # asterisk?
1102	$text =~ s/\\023/\023/g; # e acute ('e)
1103	$text =~ s/\\177/\252/g; # u"
1104	# $text =~ s/ ?? /\344/g; # a"
1105
1106	print OUT "$text";
1107	close OUT;
1108	}
1109	# wrap the text - use a minimum length. ie, first space after this length.
1110	my $wrap_length=72;
1111	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
1112	open INFILE, "$output_filestem.text.tmp" \|\|
1113	die "Couldn't open file: $!";
1114	open OUTFILE, ">$output_filestem.text" \|\|
1115	die "Couldn't open file for writing: $!";
1116	my $line="";
1117	while ($line=<INFILE>) {
1118	while (length($line)>0) {
1119	if (length($line)>$wrap_length) {
1120	$line =~ s/^(.{$wrap_length}[^\s])\s//;
1121	print OUTFILE "$1\n";
1122	} else {
1123	print OUTFILE "$line";
1124	$line="";
1125	}
1126	}
1127	}
1128	close INFILE;
1129	close OUTFILE;
1130	&util::rm("$output_filestem.text.tmp");
1131
1132	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1133	return 1;
1134	}
1135
1136
# Convert a PS file to various types of image by running pstoimg.pl
# (which relies on ImageMagick's convert utility).
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path stem; "$output_filestem.out" and
#                      "$output_filestem.err" are used as temporary capture
#                      files for the converter's stdout/stderr
#   $output_type     - requested output type; only the part after the last
#                      underscore is handed to pstoimg.pl (presumably values
#                      look like "pagedimg_gif" -- verify against caller)
#
# Reads the file-level variables $timeout, $is_winnt_2000 and $faillogfile.
#
# Returns 1 if pstoimg.pl exited successfully, 0 otherwise.  On failure the
# converter's error output is echoed to STDERR and, when $faillogfile is set,
# appended to that file.  The temporary .out/.err files are removed either way.
sub ps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $result = `identify 2>&1`;
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            # ImageMagick is not installed, thus the convert utility is
            # not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    # Lexical so that building the command does not clobber a package-level $cmd
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore; a value without an
    # underscore is passed through untouched.
    $output_type =~ s/.*\_(.*)/$1/i;

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval != 0) {
        print STDERR "Error executing pstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err")
                or die "Couldn't open $output_filestem.err: $!";
            print STDERR "pstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close($errlog);
        }

        if (-e "$output_filestem.err") {
            # copy the converter's error output into the fail log, if one is in use
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print $faillog $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1204
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path stem; the result is written to
#                      "$output_filestem.html"
#
# The text is first extracted into "$output_filestem.text" by any_to_text,
# then wrapped in a minimal HTML page: blank lines become "<p>", every other
# line is prefixed with "<br> ".  The intermediate .text file is removed.
#
# Returns 1 on success, 0 if the text extraction failed or a file could not
# be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', "$output_filestem.text")) {
        print STDERR "Couldn't read $output_filestem.text: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', "$output_filestem.html")) {
        print STDERR "Couldn't write $output_filestem.html: $!\n";
        close($text_fh);
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # angle brackets in the extracted text must not be read as markup
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close($html_fh);
    close($text_fh);

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1241
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to read (read in binary mode)
#   $output_filestem - output path stem; text is written to
#                      "$output_filestem.text"
#
# Does nothing and returns 0 unless the file-level flag $use_strings is set.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 if a file
# could not be opened or if nothing printable was found, in which case the
# (empty) output file is removed.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);

    my $out_fh;
    if (!open($out_fh, '>', "$output_filestem.text")) {
        # don't leave the input handle open on the failure path
        close($in_fh);
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in_fh>) {

        # replace each run of non-printable characters with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    close($out_fh);
    close($in_fh);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: