Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 11487

Last change on this file since 11487 was 10534, checked in by chi, 19 years ago
Adding pagedimg conversion types for PS documents. ImageMagick's convert utility is used to convert PS documents to different image types (JPEG, GIF, PNG).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 36.6 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the front
# of the module search path so the modules loaded below can be found.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51	use parsargv;
52	use util;
53	use Cwd;
54	use File::Basename;
55
# Are we running on WinNT or Win2000 (or later)?  The eval yields undef when
# the Win32 module cannot be loaded, which is treated as "no".
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse.
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom,
    $pdf_ignore_images, $pdf_allow_images_only, $windows_scripting);
67
# Print the command-line synopsis to STDERR and terminate the program with
# exit status 1.  This routine never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg-jpg|pagedimg-gif|pagedimg-png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script when converting Microsoft Word and PPT via VB script\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
89
# File that converter error messages are appended to ("" means none) and the
# CPU-seconds limit handed to ulimit (0 means no limit); main() fills both in
# from the -errlog and -timeout options.
my ($faillogfile, $timeout) = ("", 0);
92
# Entry point.  Parses the command line, works out the input type (from the
# -type option or, failing that, from the filename extension), changes into
# the document's directory, runs the matching convert* routine and prints
# its result string followed by a newline on STDOUT.
#
# Arguments: the command-line argument list (normally @ARGV).
# Exits with status 1 (directly or through print_usage()) when the options
# are invalid, the input file is unreadable, no input type can be
# determined, or the input type is not supported.
sub main
{
    # work on a private copy rather than shadowing the global @ARGV
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|dot|pdf|ps|ppt|rtf|xls)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'use_strings', \$use_strings,
                         'windows_scripting', \$windows_scripting,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the lower-cased extension without its dot
    if (!defined $input_type || $input_type eq "") {
        $input_type = lc($suffix);
        $input_type =~ s/^\.//;
    }

    # An empty type here means there was neither a -type option nor a
    # filename extension to go by.
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "doc" || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to the caller on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
181
# Run the converter on the real command line.
main(@ARGV);
183
184
185
186	# Document-type conversion functions
187	#
188	# The following functions attempt to convert documents from their
189	# input type to the specified output type. If no output type was
190	# given, then they first attempt HTML, and then TEXT.
191	#
192	# Each returns the output type ("html" or "text") or "fail" if no
193	# conversion is possible.
194
# Convert a Microsoft Word document.
#
# Arguments: input filename, output file stem (output path without
# extension), requested output type.
# Returns whatever the converter that is dispatched to returns.
sub convertDOC {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = find_docfile_type($input_filename);

    if ($realtype eq "word6" || $realtype eq "word7" || $realtype eq "word8") {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }
    return convertAnything($input_filename, $output_filestem, $output_type);
}
211
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML output is acceptable (no output type given, or one matching
# "html") the document is first run through native_doc_to_html() if
# -windows_scripting was requested, otherwise through doc_to_html().
# If that is skipped or fails, convertAnything() is tried.
#
# Returns "html" on success of the Word converter, otherwise the result of
# convertAnything().
sub convertWord678 {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        my $success = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
232
233
# Convert a Rich Text Format (RTF) file.
#
# Returns "html" when rtf_to_html() succeeds (only attempted when no output
# type was given or the type matches "html"), otherwise "fail".
sub convertRTF {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (rtf_to_html($input_filename, $output_filestem));
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
254
255
# Convert an unidentified file.
#
# Tries any_to_html() when HTML output is acceptable, then any_to_text()
# when text output is acceptable; an empty/unset output type accepts both.
# Returns "html", "text" or "fail".
sub convertAnything {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html" if (any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if (any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
280
281
282
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output file stem,
# requested output type.
# Tries, in order and only where the requested output type allows it:
# page images (pdf_to_img), HTML (pdf_to_html), text (pdf_to_text).
# Returns "item", "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drop the first "x-" pair from the type so that the character after the
    # dash directly follows what precedes it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    my $wants_image = ($output_type =~ /jp?g|gif|png/i);
    if ($wants_image
        && pdf_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to HTML
    my $wants_html = (!$output_type || ($output_type =~ /html/i));
    if ($wants_html
        && pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    my $wants_text = (!$output_type || ($output_type =~ /text/i));
    if ($wants_text
        && pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
317
318
# Convert an Adobe PostScript document.
#
# Arguments: input filename, output file stem, requested output type.
# Tries page images (ps_to_img) when an image type was requested, then text
# (ps_to_text) when no type was given or the type matches "text".
# Returns "item", "text" or "fail".
sub convertPS {
    # lexical copies: the arguments must not leak into package globals
    my ($input_filename, $output_filestem, $output_type) = @_;

    # ps_to_img() is handed a directory as its first argument, but none is
    # passed in to this routine; derive it from the input filename instead
    # of passing an undefined value.
    my $dirname = (File::Basename::fileparse($input_filename))[1];

    # Drop the first "x-" pair from the type so that the character after the
    # dash directly follows what precedes it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ /jp?g|gif|png/i) {
        if (ps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        if (ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
343
344
# Convert a Microsoft PowerPoint document.
#
# Arguments: input filename, output file stem, requested output type.
# With -windows_scripting and an output type that is neither html nor text,
# the external "pptextract" program is run to produce slide images in the
# directory named by the output stem.  Otherwise, when HTML is acceptable,
# ppttohtml.pl is run.  If neither succeeds, any_to_text() is the fallback.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # redirecting STDERR is skipped on windows unless it is NT/2000
    my $can_redirect_stderr =
        ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ /html/i) && ($output_type !~ /text/i)) {
        # image format switch for pptextract
        my $ppt_convert_type = "";
        if ($output_type =~ /gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ /jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ /png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ /^windows$/i);

        # if the converting directory has already existed
        if (-d $output_filestem) {
            print STDERR "**The conversion directory has existed\n";
            return "item";
        }

        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($can_redirect_stderr);

        if (system($cmd) != 0) {
            print STDERR "Powerpoint VB Scripting convert failed\n";
        } else {
            return "item";
        }
    }
    elsif (!$output_type || ($output_type =~ /html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($can_redirect_stderr);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Powerpoint 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: plain text extraction
    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
406
407
# Convert a Microsoft Excel document.
#
# Arguments: input filename, output file stem, requested output type.
# Runs xlstohtml.pl when HTML is acceptable; if that is skipped or fails,
# falls back to any_to_text().
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # redirecting STDERR is skipped on windows unless it is NT/2000
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) != 0) {
            print STDERR "Excel 95/97 converter failed $!\n";
        } else {
            return "html";
        }
    }

    # Last resort: plain text extraction
    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
440
441
442
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: name of the file to inspect.
# Returns "rtf" when the first line starts with "{\rtf", "word6", "word7"
# or "word8" when a "Word.Document.N" marker is found on any line, and
# "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
478
479
480	# Specific type-to-type conversions
481	#
482	# Each of the following functions attempts to convert a document from
483	# a specific format to another. If they succeed they return 1 and leave
484	# the output document(s) in the appropriate place; if they fail they
485	# return 0 and delete any working files.
486
487
# Attempt to convert a word document to html with the wv program
#
# Arguments: input filename, output file stem.
# Writes "<stem>.html"; the converter's STDERR goes to "<stem>.err" where
# redirection is used.  Returns 1 when the program exits cleanly and the
# first line of the HTML file contains "DOCTYPE HTML", otherwise 0.
# Converter messages are echoed to STDERR and appended to the fail log when
# one was named with -errlog.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                    $ENV{'GSDLOS'}, "wvWare");

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $wv_conf = util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                     "packages", "wv", "wvHtml.xml");

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ /DOCTYPE HTML/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    return 0;
}
566
567
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: input filename, output file stem.
# Returns 1 straight away when "<stem>.html" already exists.  Otherwise runs
# word2html and returns 1 when it exits cleanly and the first line of the
# HTML file contains "html"; returns 0 on any failure.  Converter messages
# are echoed to STDERR and appended to the fail log when one was named with
# -errlog.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                      $ENV{'GSDLOS'}, "word2html");
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ /^windows$/i);

    if (-e "$output_filestem.html") {
        print STDERR "*** The conversion file has existed\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ /\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, '<', "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ /html/) {
            util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }
    return 0;
}
644
# Attempt to convert an RTF document to html with rtftohtml
#
# Arguments: input filename, output file stem.
# Runs rtftohtml to produce "<stem>.html".  The conversion counts as
# successful when that file has at least one line containing a word
# character (after stripping tags) below the line holding "<body>".  If a
# "<stem>_ToC.html" table of contents was produced as well, it is spliced
# into the HTML after the header lines with its links made relative.
# Returns 1 on success; on failure logs to the fail log (if any), removes
# the working files and returns 0.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ /<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ /\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            util::mv("$output_filestem.html", "$output_filestem.src");

            # "or" (not "||") so that the counter is tied to the result of
            # open() rather than to the filename string
            my ($htmlsrc, $toc, $html_out);
            my $open_failed = 0;
            open($htmlsrc, '<', "$output_filestem.src") or ++$open_failed;
            open($toc, '<', "${output_filestem}_ToC.html") or ++$open_failed;
            open($html_out, '>', "$output_filestem.html") or ++$open_failed;

            if ($open_failed) {
                # give up on the table of contents and put the plain
                # converter output back in place
                foreach my $handle ($htmlsrc, $toc, $html_out) {
                    close($handle) if (defined $handle && defined fileno($handle));
                }
                util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to, but
            # not including, the first line without a word character).
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ /\w/);
                print {$html_out} "$header_line";
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc>; } # ignore first 2 lines
            print {$html_out} scalar(<$toc>); # line 3 = "<ol>\n"
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@ ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            util::rm("${output_filestem}_ToC.html");
            util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        util::rm("$output_filestem.err");
    }

    util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
762
763
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: directory of the input file (not used here), input filename,
# output file stem.
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# Returns 1 when the script exits cleanly and "<stem>.html" is non-empty;
# otherwise echoes the converter's error output to STDERR, appends it to the
# fail log (if any), removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no STDERR redirection here: STDOUT is captured in the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }

        util::rm("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
825
# Convert a pdf file to various types of image with the convert command
#
# Arguments: directory of the input file (not used here), input filename,
# output file stem, image type.
# Returns 0 without converting when ImageMagick's "identify" cannot be run.
# Otherwise runs pdftoimg.pl and returns 1 when it exits cleanly; on failure
# echoes the converter's error output to STDERR, appends it to the fail log
# (if any), removes the working files and returns 0.
sub pdf_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $probe_output = `identify 2>&1`;
        # Linux and Windows return different values for "program not found".
        # The probe contains a redirection and therefore runs through the
        # shell, which reports a missing command with exit status 127.
        if ($? == -1 || $? == 256 || ($? >> 8) == 127) {
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.\_(.)/$1/i;
    $cmd .= "perl -S pdftoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no STDERR redirection here: STDOUT is captured in the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        #print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
894
# Convert a PDF file to text with the pdftotext command
#
# Arguments: directory of the input file (not used here), input filename,
# output file stem.
# Runs pdftotext to produce "<stem>.text".  Returns 1 when that file ends up
# non-empty and contains at least one word character; otherwise echoes the
# converter's error output to STDERR, appends it to the fail log (if any),
# removes the working files and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no STDERR redirection here: STDOUT is captured in the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extracted, '<', "$output_filestem.text")) {
            binmode($extracted); # just in case...
            while (my $line = <$extracted>) {
                if ($line =~ /\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extracted);
        } else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text") {
        # print out the converters std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }

        # does this converter create a .out file?
        util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            util::rm("$output_filestem.err");
        }
        return 0;
    }

    # Success: clear away both redirect targets, not just the .err file, so
    # that no working file is left next to the extracted text.
    util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
960
961	# Convert a PostScript document to text
962	# note - just using "ps2ascii" isn't good enough, as it
963	# returns 0 for a postscript interpreter error. ps2ascii is just
964	# a wrapper to "gs" anyway, so we use that cmd here.
965
966	sub ps_to_text {
967	my ($input_filename, $output_filestem) = @_;
968
969	my $error = "";
970
971	# if we're on windows we'll fall straight through without attempting
972	# to use gs
973	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
974	$error = "Windows does not support gs";
975
976	} else {
977	my $cmd = "";
978	if ($timeout) {$cmd = "ulimit -t $timeout; ";}
979	$cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
980	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
981	#$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
982	$cmd .= " 2> $output_filestem.err";
983	$!=0;
984
985	my $retcode=system($cmd);
986	$retcode = $? >> 8; # see man perlfunc - system for this...
987	# if system returns -1 \| 127 (couldn't start program), look at $! for message
988
989	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
990	elsif (! -e "$output_filestem.text") {
991	$error="did not create output file.\n";
992	}
993	else
994	{ # make sure the interpreter didn't get an error. It is technically
995	# possible for the actual text to start with this, but....
996	open PSOUT, "$output_filestem.text";
997	if (<PSOUT> =~ /^Error: (.*)/) {
998	$error="interpreter error - \"$1\"";
999	}
1000	close PSOUT;
1001	}
1002	}
1003
1004	if ($error ne "")
1005	{
1006	print STDERR "Warning: Error executing gs: $error\n";
1007	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
1008
1009	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
1010	{
1011	print FAILLOG "gs - $error\n";
1012	if (-e "$output_filestem.err") {
1013	open(ERRLOG, "$output_filestem.err");
1014	while (<ERRLOG>) {print FAILLOG $_;}
1015	close ERRLOG;
1016	}
1017	close FAILLOG;
1018	}
1019	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1020
1021
1022	# Fine then. We'll just do a lousy job by ourselves...
1023	# Based on 5-line regexp sed script found at:
1024	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
1025	#
1026	print STDERR "Stripping text from postscript\n";
1027	my $errorcode=0;
1028	open (IN, "$input_filename")
1029	\|\| ($errorcode=1, warn "Couldn't read file: $!");
1030	open (OUT, ">$output_filestem.text")
1031	\|\| ($errorcode=1, warn "Couldn't write file: $!");
1032	if ($errorcode) {print STDERR "errors\n";return 0;}
1033
1034	my $text=""; # this is for whole .ps file...
1035	$text = join('', <IN>); # see man perlport, under "System Resources"
1036	close IN;
1037
1038	# Make sure this is a ps file...
1039	if ($text !~ /^%!/) {
1040	print STDERR "Bad postscript header: not '%!'\n";
1041	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
1042	{
1043	print FAILLOG "Bad postscript header: not '%!'\n";
1044	close FAILLOG;
1045	}
1046	return 0;
1047	}
1048
1049	# if ps has Page data, then use it to delete all stuff before it.
1050	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
1051
1052	# remove all leading non-data stuff
1053	$text =~ s/^.*?\(//s;
1054
1055	# remove all newline chars for easier processing
1056	$text =~ s/\n//g;
1057
1058	# Big assumption here - assume that if any co-ordinates are
1059	# given, then we are at the end of a sentence.
1060	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
1061
1062	# special characters--
1063	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
1064
1065	# ? ps text formatting (eg italics?) ?
1066	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
1067	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
1068	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
1069	# default - remove the rest
1070	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
1071
1072	# attempt to add whitespace between words...
1073	# this is based purely on observation, and may be completely wrong...
1074	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
1075	# eg I notice "b(" is sometimes NOT a space if preceded by a
1076	# negative number.
1077	$text =~ s/\)\d+ ?b\(/\) \( /g;
1078
1079	# change quoted braces to brackets
1080	$text =~ s/([^\\])\\\(/$1\{/g;
1081	$text =~ s/([^\\])\\\)/$1\}/g ;
1082
1083	# remove everything that is not between braces
1084	$text =~ s/\)([^\(\)])+?\(//sg ;
1085
1086	# remove any Trailer eof stuff.
1087	$text =~ s/\)[^\)]*$//sg;
1088
1089	### ligatures have special characters...
1090	$text =~ s/\\013/ff/g;
1091	$text =~ s/\\014/fi/g;
1092	$text =~ s/\\015/fl/g;
1093	$text =~ s/\\016/ffi/g;
1094	$text =~ s/\\214/fi/g;
1095	$text =~ s/\\215/fl/g;
1096	$text =~ s/\\017/\n\* /g; # asterisk?
1097	$text =~ s/\\023/\023/g; # e acute ('e)
1098	$text =~ s/\\177/\252/g; # u"
1099	# $text =~ s/ ?? /\344/g; # a"
1100
1101	print OUT "$text";
1102	close OUT;
1103	}
1104	# wrap the text - use a minimum length. ie, first space after this length.
1105	my $wrap_length=72;
1106	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
1107	open INFILE, "$output_filestem.text.tmp" \|\|
1108	die "Couldn't open file: $!";
1109	open OUTFILE, ">$output_filestem.text" \|\|
1110	die "Couldn't open file for writing: $!";
1111	my $line="";
1112	while ($line=<INFILE>) {
1113	while (length($line)>0) {
1114	if (length($line)>$wrap_length) {
1115	$line =~ s/^(.{$wrap_length}[^\s])\s//;
1116	print OUTFILE "$1\n";
1117	} else {
1118	print OUTFILE "$line";
1119	$line="";
1120	}
1121	}
1122	}
1123	close INFILE;
1124	close OUTFILE;
1125	&util::rm("$output_filestem.text.tmp");
1126
1127	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
1128	return 1;
1129	}
1130
1131
# Convert a PS file to various types of image by running pstoimg.pl, which
# relies on the convert utility of ImageMagick.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not referenced
#                      in this routine
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path handed to pstoimg.pl; also the stem of the
#                      temporary "<stem>.out" and "<stem>.err" capture files
#   $output_type     - requested output type. Only the part after the last
#                      underscore is passed to pstoimg.pl (presumably values
#                      look like "pagedimg_jpg" -- confirm against the caller)
#
# Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
#
# Returns 1 if pstoimg.pl exited with status 0, otherwise 0 (after echoing
# the converter's error output to STDERR and appending it to $faillogfile
# when one is configured).
sub ps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $result = `identify 2>&1`;
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            # ImageMagick is not installed, thus the convert utility is
            # not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available\n";
            return 0;
        }
    }

    # Keep the command string local to this routine.
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }

    # Keep only the image type that follows the last underscore. A value
    # without an underscore is passed through unchanged.
    $output_type =~ s/.*_(.*)/$1/;

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $cmd .= "perl -S pstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # No separate stderr redirection here; stdout goes to the .err file.
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval != 0) {
        print STDERR "Error executing pstoimg.pl";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";

        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        if (-e "$output_filestem.err") {
            # Read the converter's error output once; failing to read a
            # diagnostic file should not abort the whole conversion run.
            my @err_lines = ();
            if (open(my $errlog, '<', "$output_filestem.err")) {
                @err_lines = <$errlog>;
                close $errlog;
            } else {
                print STDERR "Couldn't read $output_filestem.err: $!\n";
            }

            # print out the converter's std err, if any
            if (@err_lines) {
                print STDERR "pstoimg error log:\n";
                print STDERR @err_lines;
            }

            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog @err_lines;
                close $faillog;
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    # Success: the capture files are no longer needed.
    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1199
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - file to extract text from
#   $output_filestem - output path without extension; the result is written
#                      to "<stem>.html"
#
# The text is first produced by any_to_text() in "<stem>.text"; that
# intermediate file is removed before returning. Blank lines become "<p>",
# every other line is emitted after a "<br> " with &, < and > escaped.
#
# Returns 1 if the HTML file was written, 0 otherwise.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # Escape markup characters so extracted text cannot be read as HTML.
        # The ampersand must go first or it would re-escape the entities
        # produced by the next two substitutions.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ /^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    # Buffered write errors only surface when the handle is closed.
    my $written_ok = close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return $written_ok ? 1 : 0;
}
1236
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - file to extract text from (read in binary mode)
#   $output_filestem - output path without extension; the result is written
#                      to "<stem>.text"
#
# Does nothing and returns 0 unless the file-level $use_strings flag is set.
# Returns 1 if at least one chunk of text was written, otherwise 0 (the
# output file is removed when nothing usable was found).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        close $in;    # don't leave the input handle open on failure
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # replace each run of non-printable characters (anything outside
        # space..tilde) with a newline
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long; repeat until no
        # short non-empty line remains
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ /^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    # Buffered write errors only surface when the handle is closed.
    my $written_ok = close $out;
    close $in;

    # try to protect against binary only formats
    if ($written_ok && $output_line_count) {
        return 1;
    }

    &util::rm($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: