Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24608

Last change on this file since 24608 was 24608, checked in by ak19, 13 years ago
New debug output statement messes up sequence of error messages in PDF tutorial.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 35.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Compile-time setup: refuse to run without GSDLHOME, and put the Greenstone
# perllib directory at the front of the module search path so that the
# `use parsargv;` / `use util;` lines below can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?
# The eval guards the `require`: where the Win32 module cannot be loaded
# the eval yields undef, which is normalised to 0 below. (The original
# block carried an unreachable `return 0;` after the first return.)
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 if (!defined($is_winnt_2000));

# Command-line switches; filled in by parsargv::parse() inside main().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
68
# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1. Never returns.
sub print_usage
{
    # The chunks are emitted verbatim and in order.
    my @usage_chunks = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $chunk (@usage_chunks) {
        print STDERR $chunk;
    }
    exit(1);
}
90
# File that converter error output is appended to (-errlog); "" disables it.
my $faillogfile="";
# Value of -timeout; when non-zero the converters prefix their shell
# commands with "ulimit -t <timeout>;".
my $timeout=0;
# Value of -verbose; handed on to wvware.pl by doc_to_html().
my $verbosity=0;

# Entry point: parse the command line, work out the input type (from -type
# or, failing that, from the filename extension), change into the input
# file's directory, run the matching convertXXX() function and print the
# result it returns followed by a newline on STDOUT.
#
# Parameters: the command-line arguments (normally @ARGV).
# Exits with status 1 on usage errors, an unreadable input file, or an
# unknown/undeterminable input type.
sub main
{
    my @args = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The switch arrives on the command line with its leading dash(es), so
    # the match has to allow for them; the undashed spelling is still
    # accepted for backward compatibility.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # The converters treat a false output type as "try HTML, then TEXT".
    # "auto" is documented as a legal value but is neither false nor a match
    # for html/text, so map it (and undef) onto the empty string here.
    if (!defined $output_type || $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames. File::Basename is loaded explicitly rather than
    # relying on another module having pulled it in.
    require File::Basename;
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the lower-cased extension without its dot
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome on STDOUT
    print "$result\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
202
203
204
205	# Document-type conversion functions
206	#
207	# The following functions attempt to convert documents from their
208	# input type to the specified output type. If no output type was
209	# given, then they first attempt HTML, and then TEXT.
210	#
211	# Each returns the output type ("html", "text", or "item" for image output)
212	# or "fail" if no conversion is possible.
213
214	# Convert a Microsoft word document
215
# Convert a file that claims to be a Microsoft Word document.
# The real format is sniffed from the file first, because many .doc files
# are not in fact Word documents, and the work is handed to the matching
# converter. Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected_type = find_docfile_type($input_filename);

    my %word_types = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if (exists $word_types{$detected_type}) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    if ($detected_type eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic treatment
    return convertAnything($input_filename, $output_filestem, $output_type);
}
231
232	# Convert a Microsoft word 6/7/8 document
233
# Convert a Microsoft Word 6/7/8 (or docx) document.
# When HTML is acceptable (no output type given, or one mentioning "html")
# a dedicated converter is tried first: the Windows scripting one if
# -windows_scripting was given, doc_to_html() otherwise. On success "html"
# is returned; in every other case the result of convertAnything() is.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
251
252
253	# Convert a Rich Text Format (RTF) file
254
# Convert a Rich Text Format (RTF) file.
# Only a specialised HTML conversion is attempted (Windows scripting if
# -windows_scripting was given, rtf_to_html() otherwise). Returns "html"
# on success and "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over, so there is
    # deliberately no fall-back to convertAnything() here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
279
280
281	# Convert an unidentified file
282
# Convert an unidentified file with the generic converters.
# HTML is tried first and TEXT second, each only if the requested output
# type is empty or mentions it. Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if (($no_preference || ($output_type =~ m/html/i))
        && any_to_html($input_filename, $output_filestem)) {
        return "html";
    }

    # Convert to text
    if (($no_preference || ($output_type =~ m/text/i))
        && any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
305
306
307
308	# Convert an Adobe PDF document
309
# Convert an Adobe PDF document.
# Depending on the requested output type this tries, in order, images
# (result "item"), HTML (result "html") and TEXT (result "text"); "fail"
# is returned when nothing succeeded.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops the first "<char>-" pair found in the output type, keeping the
    # character that follows the dash
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    my $no_preference = !$output_type;

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html"
            if (pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if (pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
342
343
344	# Convert an Adobe PostScript document
345
# Convert an Adobe PostScript document.
# Tries images first when the output type names an image format (result
# "item"), then TEXT when the output type is empty or mentions "text"
# (result "text"). Returns "fail" otherwise.
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # Drops the first "<char>-" pair found in the output type, keeping the
    # character that follows the dash
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
368
369
# Convert a Microsoft PowerPoint document.
#  - With -windows_scripting and an output type that mentions neither
#    "html" nor "text", the pptextract tool is run to produce slide images
#    (result "item"; also returned straight away if the output directory
#    already exists).
#  - Otherwise, if the output type is empty or mentions "html",
#    ppttohtml.pl is run (result "html").
# If the chosen converter fails, or neither case applies, any_to_text() is
# the last resort (result "text"), and "fail" is returned if that fails too.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Pick the pptextract flag for the first image format mentioned
        my $ppt_convert_type = "";
        foreach my $format (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $flag) = @$format;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $flag;
                last;
            }
        }

        my $vbScript = util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                          $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_exec = util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
433
434
# Convert a Microsoft Excel document.
# xlstohtml.pl is tried when the output type is empty or mentions "html"
# (result "html"); any_to_text() is the fall-back (result "text").
# Returns "fail" if neither produced anything.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exec = util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if (any_to_text($input_filename, $output_filestem));

    return "fail";
}
468
469
470
471	# Find the real type of a .doc file
472	#
473	# We seem to have a lot of files with a .doc extension that are .rtf
474	# files or Word 5 files. This function attempts to tell the difference.
# Sniff the real format of a file carrying a .doc-style extension.
#
# Parameter: $input_filename - path of the file to inspect.
# Returns one of:
#   "docx"    - -windows_scripting is on and the name ends in .docx
#               (any letter case, matching native_doc_to_html's test)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6" / "word7" / "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    # Three-argument open with a lexical handle: the filename can never be
    # mistaken for a mode, and an unreadable file is reported as "unknown"
    # instead of being read through a closed handle.
    my $chk;
    if (!open($chk, "<", $input_filename)) {
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (defined(my $line = <$chk>)) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path
    close($chk);
    return $realtype;
}
511
512
513	# Specific type-to-type conversions
514	#
515	# Each of the following functions attempts to convert a document from
516	# a specific format to another. If they succeed they return 1 and leave
517	# the output document(s) in the appropriate place; if they fail they
518	# return 0 and delete any working files.
519
520
521	# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wv program, by
# launching wvware.pl.
#
# Parameters: $input_filename, $output_filestem.
# Returns the exit code of wvware.pl; callers treat any true value as
# success (presumably wvware.pl exits with 1 on success - TODO confirm
# against wvware.pl). Returns 0 when the command could not be launched or
# was killed by a signal; the original `system(...)/256` turned both of
# those cases into a non-zero, i.e. "successful", result.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    $! = 0;
    my $raw_status = system($launch_cmd);

    if ($raw_status == -1) {
        # the child process could not be started at all
        print STDERR "Error executing wvware.pl: $!\n";
        return 0;
    }
    if ($raw_status & 127) {
        # low seven bits set: the child was terminated by a signal
        return 0;
    }

    # high byte of the wait status is the child's exit code
    return $raw_status >> 8;
}
535
536	# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with the word2html scripting
# program (or, on Windows with a docx input, the docx2html.vbs script).
#
# Parameters: $input_filename, $output_filestem.
# Returns 1 if "$output_filestem.html" already existed or was created and
# its first line mentions "html"; returns 0 otherwise. On failure the
# converter's stderr capture is copied to the fail log (if one is
# configured).
#
# Compared with the original: the .err handle read after a failed launch is
# now closed, and every open is checked before its handle is used.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errfile>)) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, "<", "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
637	# Attempt to convert an RTF document to html with rtftohtml
# Attempt to convert an RTF document to html with rtftohtml.
#
# Parameters: $input_filename, $output_filestem.
# Returns 1 if "$output_filestem.html" was produced and contains some word
# characters after its <body> tag (a generated "_ToC.html" table of
# contents, if present, is merged into it); returns 0 otherwise, after
# copying the converter's stderr capture to the fail log (if configured)
# and removing the working files.
#
# Compared with the original: the three opens used for the table-of-contents
# merge were written `open FH, "file" || ++$open_failed`, where `||` binds
# to the (always true) filename, so a failed open was never noticed. They
# are now checked properly, as is the open used for the content check.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html"
        && open(my $html_in, "<", "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header=0;
        while (defined(my $line = <$html_in>)) {
            if (!$past_header) {
                if ($line =~ m/<body>/) {$past_header=1;}
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful=1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            my ($src_fh, $toc_fh, $out_fh);
            my $src_ok = open($src_fh, "<", "$output_filestem.src");
            my $toc_ok = open($toc_fh, "<", "${output_filestem}_ToC.html");
            my $out_ok = open($out_fh, ">", "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the merge and put the converter's output back
                close($src_fh) if ($src_ok);
                close($toc_fh) if ($toc_ok);
                close($out_fh) if ($out_ok);
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $header_line = <$src_fh>)) {
                last if ($header_line !~ m/\w/);
                print $out_fh $header_line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc_fh>; } # ignore first 2 lines
            my $list_start = <$toc_fh>; # line 3 = "<ol>\n"
            print $out_fh $list_start if (defined $list_start);
            while (defined(my $toc_line = <$toc_fh>)) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print $out_fh $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (defined(my $body_line = <$src_fh>)) {
                print $out_fh $body_line;
            }
            close($src_fh);
            close($out_fh);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile))
        {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
753
754
755	# Convert a pdf file to html with the pdftohtml command
756
# Convert a pdf file to html by running pdftohtml.pl.
#
# Parameters: $dirname (not used by this function), $input_filename,
# $output_filestem.
# Returns 1 if the command exited with status 0 and left a non-empty
# "$output_filestem.html"; otherwise prints the converter's error output to
# STDERR, appends it to the fail log (if configured), removes the working
# files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # Assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $perl_exec = util::get_perl_exec();
    $cmd .= "\"$perl_exec\" -S pdftohtml.pl -zoom $pdf_zoom";
    foreach my $switch ([" -c", $pdf_complex],
                        [" -i", $pdf_ignore_images],
                        [" -a", $pdf_allow_images_only],
                        [" -hidden", !$pdf_nohidden]) {
        my ($flag, $enabled) = @$switch;
        $cmd .= $flag if ($enabled);
    }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr go to separate files where the shell can do that;
    # otherwise only stdout is captured, into the .err file
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # Success: the command worked and the converter made something
    if ($retval == 0 && -s $html_file) {
        util::rm($err_file) if (-e $err_file);
        util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # Failure: report and tidy up
    util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (<$errlog>) {
            print STDERR "$_";
        }
        close($errlog);
    }

    util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, "$output_filestem.err");
            while (<$errlog>) { print $faillog $_; }
            close($errlog);
            close($faillog);
        }
        util::rm($err_file);
    }
    return 0;
}
818
819	# Convert a pdf file to various types of image with the convert command
820
# Convert a pdf or postscript file to page images by running pdfpstoimg.pl.
#
# Parameters: $dirname (not used by this function), $input_filename,
# $output_filestem, $output_type (the image type handed to -convert_to).
# Returns 1 if the command exited with status 0. Returns 0 if ImageMagick
# appears to be unavailable or the command failed; in the latter case the
# converter's error output is printed to STDERR and appended to the fail
# log (if configured).
#
# Compared with the original: the Windows 95/98 test uses the guarded
# $is_winnt_2000 flag, like every other function in this file, instead of
# calling Win32::IsWinNT() directly (which dies if the Win32 module could
# not be loaded).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    my $is_win9x = ($ENV{'GSDLOS'} eq "windows" && !$is_winnt_2000);
    if (!$is_win9x) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if ($status == -1 || $status == 1) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # Drops the first "<char>_" pair found in the output type, keeping the
    # character that follows the underscore
    $output_type =~ s/.\_(.)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        # NOTE(review): the script run above is pdfpstoimg.pl; the message
        # text is left as it was because other tooling may match on it.
        print STDERR "Error executing pdftoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # Only the exit status is checked; the output itself is not inspected
    if ($retval != 0)
    {
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (defined(my $err_line = <$errlog>)) {
                print STDERR "$err_line";
            }
            close($errlog);
        }

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile))
            {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (defined(my $err_line = <$errlog>)) {
                        print $faillog $err_line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
900
901	# Convert a PDF file to text with the pdftotext command
902
# Convert a PDF file to text with the pdftotext command.
#
# Parameters: $dirname (not used by this function), $input_filename,
# $output_filestem.
# Returns 1 if "$output_filestem.text" exists afterwards, is non-empty and
# contains at least one word character; otherwise prints the converter's
# error output to STDERR, appends it to the fail log (if configured),
# removes the working files and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open(my $extracted, "$output_filestem.text") || warn "open: $!";
        binmode($extracted); # just in case...
        my $seen_text = 0;
        my $line = "";
        # NB: the loop tests the truth of the line read, not its definedness
        while (!$seen_text && ($line = <$extracted>)) {
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close($extracted);
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            util::rm($text_file);
        }
    }

    # Success: the converter made something
    if (-s $text_file) {
        util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") || die "$!";
        print STDERR "pdftotext error log:\n";
        while (<$errlog>) {
            print STDERR "$_";
        }
        close($errlog);
    }

    # does this converter create a .out file?
    util::rm($out_file) if (-e $out_file);
    util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, "$output_filestem.err");
            while (<$errlog>) { print $faillog $_; }
            close($errlog);
            close($faillog);
        }
        util::rm($err_file);
    }
    return 0;
}
966
# Convert a PostScript document to text.
#
# Note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so we run that command here.
#
# Arguments:
#   $input_filename  - path of the PostScript file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Returns 1 if "$output_filestem.text" was produced, 0 otherwise.
#
# If gs cannot be used (never attempted on Windows) or reports an error, the
# problem is printed to STDERR, appended to $faillogfile when that is set,
# and a crude regexp-based extraction is tried instead. Whatever text was
# obtained is finally re-wrapped to roughly 72 columns. Dies if the
# temporary file used for wrapping cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $error = _ps_run_gs($input_filename, $output_filestem);

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");

        # record the failure, followed by whatever gs wrote to stderr
        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "gs - $error\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                print $faillog $_ while (<$errlog>);
                close $errlog;
            }
            close $faillog;
        }
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");

        # Fine then. We'll just do a lousy job by ourselves...
        return 0 unless _ps_strip_text($input_filename, $output_filestem);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    _wrap_text_file("$output_filestem.text", 72);

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    return 1;
}

# Run gs (with ps2ascii.ps) on the input, writing "$output_filestem.text"
# and "$output_filestem.err"; returns "" on success or an error description.
sub _ps_run_gs {
    my ($input_filename, $output_filestem) = @_;

    # if we're on windows we fall straight through without attempting
    # to use gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        return "Windows does not support gs";
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout; ";}
    $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
    #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
    # quoted like the other paths so that file names with spaces work
    $cmd .= " 2> \"$output_filestem.err\"";
    $! = 0;

    system($cmd);
    # see man perlfunc - system for this... Any non-zero wait status is a
    # failure: an exit code in the high byte, a fatal signal in the low
    # bits, or -1 if the program couldn't be started (look at $! then).
    if ($? != 0) {
        return $! ? "$!" : "couldn't run.\n";
    }
    if (! -e "$output_filestem.text") {
        return "did not create output file.\n";
    }

    # make sure the interpreter didn't get an error. It is technically
    # possible for the actual text to start with this, but....
    my $error = "";
    if (open(my $psout, '<', "$output_filestem.text")) {
        my $first_line = <$psout>;   # undef when gs produced an empty file
        if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
            $error = "interpreter error - \"$1\"";
        }
        close $psout;
    }
    return $error;
}

# Crude fallback: pull the parenthesised strings out of the PostScript
# source with regexps and write them to "$output_filestem.text".
# Returns 1 on success, 0 on failure.
#
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_strip_text {
    my ($input_filename, $output_filestem) = @_;

    print STDERR "Stripping text from postscript\n";

    my $in;
    if (!open($in, '<', $input_filename)) {
        warn "Couldn't read file: $!";
        print STDERR "errors\n";
        return 0;
    }
    # this is for whole .ps file...
    my $text = join('', <$in>);  # see man perlport, under "System Resources"
    close $in;

    # Make sure this is a ps file... (checked before the output file is
    # created, so a rejected input leaves no empty .text file behind)
    if ($text !~ m/^%!/) {
        print STDERR "Bad postscript header: not '%!'\n";
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "Bad postscript header: not '%!'\n";
            close $faillog;
        }
        return 0;
    }

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
    # $text =~ s/ ?? /\344/g; # a"

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        warn "Couldn't write file: $!";
        print STDERR "errors\n";
        return 0;
    }
    print $out $text;
    close $out;
    return 1;
}

# Re-wrap $filename in place: every line longer than $wrap_length is broken
# at the first whitespace found at or after $wrap_length characters.
sub _wrap_text_file {
    my ($filename, $wrap_length) = @_;

    &util::mv($filename, "$filename.tmp");
    # "or", not "||": "||" would bind to the file name and never die
    open(my $in, '<', "$filename.tmp")
        or die "Couldn't open file: $!";
    open(my $out, '>', $filename)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$in>) {
        while (length($line) > $wrap_length) {
            # stop when there is no whitespace left to break at, instead
            # of looping forever on an unchanged line
            last unless ($line =~ s/^(.{$wrap_length}\S*)\s+//);
            print $out "$1\n";
        }
        print $out $line;
    }
    close $in;
    close $out;
    &util::rm("$filename.tmp");
}
1136
1137
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# Returns 1 on success. Returns 0 if any_to_text() could not produce text
# or if the intermediate text file or the HTML file cannot be opened.
# The intermediate "$output_filestem.text" file is removed afterwards.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', "$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', "$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close $text_fh;
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape HTML metacharacters so the extracted text can't be read
        # as markup; "&" must go first so the entities added for "<" and
        # ">" are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            # blank line => paragraph break
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1174
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to convert (read in binary mode)
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Returns 1 if at least one chunk of text was written. Returns 0 if
# $use_strings is not set, if either file cannot be opened, or if nothing
# printable was found (the output file is removed in that last case).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    # three-argument open: the file name is taken literally, so surrounding
    # whitespace or mode characters in it can't change what gets opened
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in;   # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: