Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 10493

Last change on this file since 10493 was 10464, checked in by chi, 19 years ago
Modifications of the if loop condition in convertPPT() to allow the different types of convert_to can be dealt with properly.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 34.0 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
46	BEGIN {
47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49	}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module only exists on Windows perls. Anywhere else the
# require dies inside the eval, the eval yields undef, and we fall
# back to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; these are bound to their options by the
# parsargv::parse() call in main().
my $use_strings;            # -use_strings
my $pdf_complex;            # -pdf_complex (adds -c to the pdftohtml.pl command)
my $pdf_nohidden;           # -pdf_nohidden (suppresses -hidden for pdftohtml.pl)
my $pdf_zoom;               # -pdf_zoom (passed as -zoom to pdftohtml.pl)
my $pdf_ignore_images;      # -pdf_ignore_images (adds -i for pdftohtml.pl)
my $pdf_allow_images_only;  # -pdf_allow_images_only (adds -a for pdftohtml.pl)
my $windows_scripting;      # -windows_scripting (use the VB script converters)
67
# Print the command-line synopsis on STDERR and terminate the script
# with exit status 1. Never returns.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this line
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
89
# $faillogfile: log file that converter errors are appended to (bound to
#               the errlog option in main()); "" means no log is written.
# $timeout:     value used for the `ulimit -t' prefix on converter
#               commands; 0 means no prefix is added.
my ($faillogfile, $timeout) = ("", 0);
92
# Entry point: parse the command line, work out the input type, run the
# matching convertXXX() routine and print its return value, followed by
# a newline, on STDOUT.
#
# Arguments: the command-line words (the script passes @ARGV).
# Exits with status 1, after a message on STDERR, on bad usage, an
# unreadable input file, or a missing/unsupported input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting', \$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the (lower-cased) filename extension.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix =~ /^\.(.+)$/) ? lc($1) : "";
    }
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # "auto" means "no preference". The converters test for that with
    # !$output_type, so hand them the empty string; otherwise "auto"
    # matches neither /html/ nor /text/ in their checks.
    if (!defined $output_type || $output_type =~ /^auto$/i) {
        $output_type = "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) or die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "doc" || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) or die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
183
184
185
186	# Document-type conversion functions
187	#
188	# The following functions attempt to convert documents from their
189	# input type to the specified output type. If no output type was
190	# given, then they first attempt HTML, and then TEXT.
191	#
192	# Each returns the output type ("html" or "text") or "fail" if no
193	# conversion is possible.
194
# Convert a Microsoft word document
#
# The real format of the file is sniffed first, because many .doc files
# are not in fact word documents; the file is then handed to the
# matching converter.
#   $input_filename  - file to convert
#   $output_filestem - output path without an extension
#   $output_type     - requested output type; a false value means no preference
# Returns whatever the selected converter returns.
sub convertDOC {
    # `my' keeps the arguments out of the package's global namespace
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = &find_docfile_type($input_filename);

    if ($realtype =~ /\Aword[678]\z/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
211
# Convert a Microsoft word 6/7/8 document
#
# When HTML is acceptable (no output type given, or one matching /html/i)
# the word converter is tried first: native_doc_to_html() if
# -windows_scripting was given, doc_to_html() otherwise. If that is
# skipped or fails, the file is passed to convertAnything().
# Returns "html" or the result of convertAnything().
sub convertWord678 {
    # `my' keeps the arguments out of the package's global namespace
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
233
# Convert a Rich Text Format (RTF) file
#
# Only HTML output is attempted, and only when no output type was given
# or the requested one matches /html/i.
# Returns "html" on success and "fail" otherwise.
sub convertRTF {
    # `my' keeps the arguments out of the package's global namespace
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&rtf_to_html($input_filename, $output_filestem));
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
254
255
# Convert an unidentified file
#
# Tries any_to_html() and then any_to_text(); each attempt is made only
# when no output type was given or the requested type matches it.
# Returns "html", "text" or "fail".
sub convertAnything {
    # `my' keeps the arguments out of the package's global namespace
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
280
281
282
# Convert an Adobe PDF document
#
#   $dirname         - directory of the input file (passed on to the pdf_to_* helpers)
#   $input_filename  - file to convert
#   $output_filestem - output path without an extension
#   $output_type     - requested output type; false/undefined means no preference
# Returns "item" (page images), "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type = "" unless defined $output_type;

    # Keep only what follows the last hyphen, so that "pagedimg-jpg"
    # becomes "jpg"; types without a hyphen are left untouched.
    $output_type =~ s/^.*-//s;

    # Attempt conversion to Image
    if ($output_type =~ /jp?g|gif|png/i) {
        if (&pdf_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (&pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
317
318
# Convert an Adobe PostScript document
#
# Only text output is attempted, and only when no output type was given
# or the requested one matches /text/i.
# Returns "text" on success and "fail" otherwise.
sub convertPS {
    # `my' keeps the arguments out of the package's global namespace
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
335
336
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract script is run (with -g, -j or -p for gif, jpg or
# png) and "item" is returned on success, or straight away when the
# output directory already exists.
# Otherwise, if HTML is acceptable, ppttohtml.pl is run and "html"
# returned on success.
# If the attempted conversion failed, or none applied, any_to_text() is
# tried as a last resort.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # avoid "uninitialized value" warnings in the pattern matches below
    $output_type = "" unless defined $output_type;

    # redirecting STDERR is only done off Windows or on WinNT/2000
    my $redirect_stderr = ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    #if (!$output_type || $windows_scripting ||($output_type !~ /html/i) ||($output_type !~ /text/i)){
    if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)) {
        my $ppt_convert_type = "";
        if ($output_type =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ /png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        # $cmd is lexical so it no longer leaks into a package global
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type $input_filename $output_filestem";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    } elsif (!$output_type || ($output_type =~ /html/i)) {
        # Attempt conversion to HTML
        # formulate the command
        my $cmd = "";
        $cmd .= "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
398
399
# Convert a Microsoft Excel document
#
# If HTML is acceptable (no output type given, or one matching /html/i)
# xlstohtml.pl is run and "html" returned on success. If that is skipped
# or fails, any_to_text() is tried.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ($cmd is lexical so it no longer leaks
        # into a package global)
        my $cmd = "";
        $cmd .= "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # redirecting STDERR is only done off Windows or on WinNT/2000
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
432
433
434
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
#   $input_filename - file to inspect (read in binary mode)
# Returns "rtf" if the first line starts with "{\rtf", "word6", "word7"
# or "word8" if any line contains "Word.Document.<6|7|8>", and "unknown"
# otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
470
471
472	# Specific type-to-type conversions
473	#
474	# Each of the following functions attempts to convert a document from
475	# a specific format to another. If they succeed they return 1 and leave
476	# the output document(s) in the appropriate place; if they fail they
477	# return 0 and delete any working files.
478
479
# Attempt to convert a word document to html with the wv program
#
#   $input_filename  - word document to convert
#   $output_filestem - output path without extension; the converter
#                      writes "$output_filestem.html" and, where STDERR is
#                      redirected, "$output_filestem.err"
# Returns 1 if a non-empty .html file whose first line contains
# "DOCTYPE HTML" was produced, 0 otherwise. On failure the converter's
# error output is appended to the fail log (if one is configured) and
# the working files are removed.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                     $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, "<", "$output_filestem.err")) {

            # $faillog stays undefined unless a fail log is configured
            # and could be opened for appending
            my $faillog;
            if ($faillogfile ne "" && !open($faillog, ">>", $faillogfile)) {
                undef $faillog;
            }

            while (my $line = <$errfile>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($faillog);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n" if ($faillog);
            } # while $errfile
            close($faillog) if ($faillog);
            close($errfile);
        }
        # a failed conversion must not leave working files behind
        &util::rm("$output_filestem.html") if -e "$output_filestem.html";
        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
558
559
# Attempt to convert a word document to html with the word2html scripting program
#
#   $input_filename  - word document to convert
#   $output_filestem - output path without extension; the script writes
#                      "$output_filestem.html"
# Returns 1 if "$output_filestem.html" already exists, or if the script
# produced a non-empty .html file whose first line contains "html";
# returns 0 otherwise. On failure the script's error output is appended
# to the fail log (if one is configured) and the working files are
# removed.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # NOTE(review): the arguments are left unquoted here; a quoted form
    # was tried and commented out in the original, so it is kept as is.
    #$cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";
    $cmd .= "$vbScript $input_filename $output_filestem.html";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err"
            && open(my $errfile, "<", "$output_filestem.err")) {

            # $faillog stays undefined unless a fail log is configured
            # and could be opened for appending
            my $faillog;
            if ($faillogfile ne "" && !open($faillog, ">>", $faillogfile)) {
                undef $faillog;
            }

            while (my $line = <$errfile>) {
                if ($line =~ /\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($faillog);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n" if ($faillog);
            } # while $errfile
            close($faillog) if ($faillog);
            close($errfile);
        }
        # a failed conversion must not leave working files behind
        &util::rm("$output_filestem.html") if -e "$output_filestem.html";
        &util::rm("$output_filestem.err") if -e "$output_filestem.err";
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ /html/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
# Attempt to convert an RTF document to html with rtftohtml
#
#   $input_filename  - RTF file to convert
#   $output_filestem - output path without extension; rtftohtml is told
#                      to write "$output_filestem.html"
# The conversion counts as successful when the .html file has at least
# one line containing a word character (after stripping tags) below the
# line containing "<body>". If "${output_filestem}_ToC.html" exists, it
# is merged into the output after the header lines.
# Returns 1 on success. On failure returns 0, appends the converter's
# error output to the fail log (if one is configured) and removes the
# working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header
        if (open(my $html, "<", "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ /<body>/);
                    next;
                }
                $line =~ s/<[^>]+>//g;
                if ($line =~ /\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html);
        } else {
            # an unreadable output file is treated as a failed conversion
            print STDERR "Unable to read $output_filestem.html: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            # Each open() is checked on its own. The original wrote
            # `open FH, "name" || ++$open_failed', where || binds to the
            # file name, so a failed open was never noticed.
            my $src_ok = open(my $htmlsrc, "<", "$output_filestem.src");
            my $toc_ok = open(my $toc, "<", "${output_filestem}_ToC.html");
            my $out_ok = open(my $html, ">", "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the table of contents and restore the
                # converter's output
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                close($html) if ($out_ok);
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html. The first line
            # without a word character ends the header and is dropped.
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ /\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc>; } # ignore first 2 lines
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close($htmlsrc);
            close($html);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
754
755
# Convert a pdf file to html with the pdftohtml command
#
#   $dirname         - directory of the input file (not used here)
#   $input_filename  - PDF file to convert
#   $output_filestem - output path without extension
# Runs "perl -S pdftohtml.pl" with the -pdf_* switches translated to its
# options. Returns 1 when the command succeeds and leaves a non-empty
# "$output_filestem.html". Otherwise returns 0 after echoing the
# converter's error output to STDERR, appending it to the fail log (if
# one is configured) and removing the working files.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # $cmd is lexical so it no longer leaks into a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            # Read the error log once. An unreadable log is only warned
            # about, so the caller can still fall back to text.
            my @err_lines;
            if (open(my $errlog, "<", "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close($errlog);
            } else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }

            # print out the converter's std err, if any
            if (@err_lines) {
                print STDERR "pdftohtml error log:\n";
                print STDERR @err_lines;
            }

            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                print {$faillog} @err_lines;
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
817
# Convert a pdf file to various types of image with the convert command
#
#   $dirname         - directory of the input file (not used here)
#   $input_filename  - PDF file to convert
#   $output_filestem - output path without extension
#   $output_type     - image type, optionally prefixed ("pagedimg_jpg")
# Runs "perl -S pdftoimg.pl -convert_to <type>". Returns 1 when the
# command exits with status 0. Returns 0 when ImageMagick's "identify"
# cannot be run, or when the command fails; in the latter case its error
# output is echoed to STDERR, appended to the fail log (if one is
# configured) and the working files are removed.
sub pdf_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    # ($is_winnt_2000 holds the result of Win32::IsWinNT(), or 0 where
    # the Win32 module is not available)
    unless ($ENV{'GSDLOS'} eq "windows" && !$is_winnt_2000) {
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    # $cmd is lexical so it no longer leaks into a package global
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore ("pagedimg_jpg" -> "jpg");
    # a bare type such as "jpg" is left untouched.
    $output_type =~ s/^.*_//s;

    # file names are quoted so that paths containing spaces survive the shell
    $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            # Read the error log once. An unreadable log is only warned
            # about, rather than aborting the whole script.
            my @err_lines;
            if (open(my $errlog, "<", "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close($errlog);
            } else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }

            #print out the converter's std err, if any
            if (@err_lines) {
                print STDERR "pdftoimg error log:\n";
                print STDERR @err_lines;
            }

            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                print {$faillog} @err_lines;
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
886
# Convert a PDF file to text with the pdftotext command
#
#   $dirname         - directory of the input file (not used here)
#   $input_filename  - PDF file to convert
#   $output_filestem - output path without extension; pdftotext is told
#                      to write "$output_filestem.text"
# Returns 1 when the .text file exists and contains at least one line
# with a word character. Otherwise returns 0 after echoing the
# converter's error output to STDERR, appending it to the fail log (if
# one is configured) and removing the working files.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extr_text, "<", "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ /\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # does this converter create a .out file?
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if (-e "$output_filestem.err") {
            # Read the error log once. An unreadable log is only warned
            # about, rather than aborting the whole script.
            my @err_lines;
            if (open(my $errlog, "<", "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close($errlog);
            } else {
                print STDERR "Unable to read $output_filestem.err: $!\n";
            }

            # print out the converters std err, if any
            if (@err_lines) {
                print STDERR "pdftotext error log:\n";
                print STDERR @err_lines;
            }

            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                print {$faillog} @err_lines;
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    # the STDOUT redirect above creates a .out file off Windows; it used
    # to be left behind after a successful conversion
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
952
953	# Convert a PostScript document to text
954	# note - just using "ps2ascii" isn't good enough, as it
955	# returns 0 for a postscript interpreter error. ps2ascii is just
956	# a wrapper to "gs" anyway, so we use that cmd here.
957
958	sub ps_to_text {
959	my ($input_filename, $output_filestem) = @_;
960
961	my $error = "";
962
963	# if we're on windows we'll fall straight through without attempting
964	# to use gs
965	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
966	$error = "Windows does not support gs";
967
968	} else {
969	my $cmd = "";
970	if ($timeout) {$cmd = "ulimit -t $timeout; ";}
971	$cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
972	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
973	#$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
974	$cmd .= " 2> $output_filestem.err";
975	$!=0;
976
977	my $retcode=system($cmd);
978	$retcode = $? >> 8; # see man perlfunc - system for this...
979	# if system returns -1 \| 127 (couldn't start program), look at $! for message
980
981	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
982	elsif (! -e "$output_filestem.text") {
983	$error="did not create output file.\n";
984	}
985	else
986	{ # make sure the interpreter didn't get an error. It is technically
987	# possible for the actual text to start with this, but....
988	open PSOUT, "$output_filestem.text";
989	if (<PSOUT> =~ /^Error: (.*)/) {
990	$error="interpreter error - \"$1\"";
991	}
992	close PSOUT;
993	}
994	}
995
996	if ($error ne "")
997	{
998	print STDERR "Warning: Error executing gs: $error\n";
999	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1000
1001	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
1002	{
1003	print FAILLOG "gs - $error\n";
1004	if (-e "$output_filestem.err") {
1005	open(ERRLOG, "$output_filestem.err");
1006	while (<ERRLOG>) {print FAILLOG $_;}
1007	close ERRLOG;
1008	}
1009	close FAILLOG;
1010	}
1011	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1012
1013
1014	# Fine then. We'll just do a lousy job by ourselves...
1015	# Based on 5-line regexp sed script found at:
1016	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
1017	#
1018	print STDERR "Stripping text from postscript\n";
1019	my $errorcode=0;
1020	open (IN, "$input_filename")
1021	\|\| ($errorcode=1, warn "Couldn't read file: $!");
1022	open (OUT, ">$output_filestem.text")
1023	\|\| ($errorcode=1, warn "Couldn't write file: $!");
1024	if ($errorcode) {print STDERR "errors\n";return 0;}
1025
1026	my $text=""; # this is for whole .ps file...
1027	$text = join('', <IN>); # see man perlport, under "System Resources"
1028	close IN;
1029
1030	# Make sure this is a ps file...
1031	if ($text !~ /^%!/) {
1032	print STDERR "Bad postscript header: not '%!'\n";
1033	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
1034	{
1035	print FAILLOG "Bad postscript header: not '%!'\n";
1036	close FAILLOG;
1037	}
1038	return 0;
1039	}
1040
1041	# if ps has Page data, then use it to delete all stuff before it.
1042	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
1043
1044	# remove all leading non-data stuff
1045	$text =~ s/^.*?\(//s;
1046
1047	# remove all newline chars for easier processing
1048	$text =~ s/\n//g;
1049
1050	# Big assumption here - assume that if any co-ordinates are
1051	# given, then we are at the end of a sentence.
1052	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
1053
1054	# special characters--
1055	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
1056
1057	# ? ps text formatting (eg italics?) ?
1058	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
1059	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
1060	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
1061	# default - remove the rest
1062	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
1063
1064	# attempt to add whitespace between words...
1065	# this is based purely on observation, and may be completely wrong...
1066	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
1067	# eg I notice "b(" is sometimes NOT a space if preceded by a
1068	# negative number.
1069	$text =~ s/\)\d+ ?b\(/\) \( /g;
1070
1071	# change quoted braces to brackets
1072	$text =~ s/([^\\])\\\(/$1\{/g;
1073	$text =~ s/([^\\])\\\)/$1\}/g ;
1074
1075	# remove everything that is not between braces
1076	$text =~ s/\)([^\(\)])+?\(//sg ;
1077
1078	# remove any Trailer eof stuff.
1079	$text =~ s/\)[^\)]*$//sg;
1080
1081	### ligatures have special characters...
1082	$text =~ s/\\013/ff/g;
1083	$text =~ s/\\014/fi/g;
1084	$text =~ s/\\015/fl/g;
1085	$text =~ s/\\016/ffi/g;
1086	$text =~ s/\\214/fi/g;
1087	$text =~ s/\\215/fl/g;
1088	$text =~ s/\\017/\n\* /g; # asterisk?
1089	$text =~ s/\\023/\023/g; # e acute ('e)
1090	$text =~ s/\\177/\252/g; # u"
1091	# $text =~ s/ ?? /\344/g; # a"
1092
1093	print OUT "$text";
1094	close OUT;
1095	}
1096	# wrap the text - use a minimum length. ie, first space after this length.
1097	my $wrap_length=72;
1098	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
1099	open INFILE, "$output_filestem.text.tmp" \|\|
1100	die "Couldn't open file: $!";
1101	open OUTFILE, ">$output_filestem.text" \|\|
1102	die "Couldn't open file for writing: $!";
1103	my $line="";
1104	while ($line=<INFILE>) {
1105	while (length($line)>0) {
1106	if (length($line)>$wrap_length) {
1107	$line =~ s/^(.{$wrap_length}[^\s])\s//;
1108	print OUTFILE "$1\n";
1109	} else {
1110	print OUTFILE "$line";
1111	$line="";
1112	}
1113	}
1114	}
1115	close INFILE;
1116	close OUTFILE;
1117	&util::rm("$output_filestem.text.tmp");
1118
1119	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1120	return 1;
1121	}
1122
1123
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is
#                      written to "$output_filestem.html"
#
# Returns 1 on success, 0 if no text could be extracted (any_to_text
# failed) or if the intermediate/HTML files could not be read/written.
# The intermediate "$output_filestem.text" file is always removed once
# any_to_text has succeeded.

sub any_to_html {
    # Lexical copies, so that package-level variables of the same name
    # are not clobbered by this call.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_filename = "$output_filestem.text";
    my $html_filename = "$output_filestem.html";
    my $ok = 1;

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_filename)) {
        print STDERR "any_to_html: couldn't read $text_filename: $!\n";
        $ok = 0;
    }
    elsif (!open($html_fh, '>', $html_filename)) {
        print STDERR "any_to_html: couldn't write $html_filename: $!\n";
        close($text_fh);
        $ok = 0;
    }
    else {
        print $html_fh "<html><head>\n";
        print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
        print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
        print $html_fh "</head><body>\n\n";

        while (my $line = <$text_fh>) {
            # Escape HTML metacharacters. '&' must be handled first so
            # the entities introduced for '<' and '>' are not re-escaped.
            $line =~ s/&/&amp;/g;
            $line =~ s/</&lt;/g;
            $line =~ s/>/&gt;/g;

            if ($line =~ /^\s*$/) {
                # blank line => paragraph break
                print $html_fh "<p>";
            } else {
                print $html_fh "<br> ", $line;
            }
        }
        print $html_fh "\n</body></html>\n";

        close($text_fh);
        # Buffered write errors only surface at close time.
        if (!close($html_fh)) {
            print STDERR "any_to_html: error writing $html_filename: $!\n";
            $ok = 0;
        }
    }

    # The text file was only an intermediate step.
    &util::rm($text_filename) if (-e $text_filename);
    return $ok;
}
1160
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to convert (read in binary mode)
#   $output_filestem - output path without extension; the result is
#                      written to "$output_filestem.text"
#
# Returns 1 if at least one chunk of printable text was written.
# Returns 0 if the $use_strings flag (set elsewhere in this file) is
# false, if either file cannot be opened or written, or if nothing
# printable was found. In the last two cases the output file is removed.

sub any_to_text {
    # Lexical copies, so that package-level variables of the same name
    # are not clobbered by this call.
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_filename = "$output_filestem.text";

    # Three-argument open: the mode can never be taken from the filename.
    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);

    my $out_fh;
    if (!open($out_fh, '>', $text_filename)) {
        close($in_fh);    # don't leak the input handle on failure
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in_fh>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    close($in_fh);
    # Buffered write errors only surface at close time.
    my $write_ok = close($out_fh);

    if ($write_ok && $output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_filename) if (-e $text_filename);
    return 0;

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: