Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 25798

Last change on this file since 25798 was 25798, checked in by ak19, 12 years ago
Fixing bug where on CentOS Linux a call to identify returns 256 and is not the error code for when Imagemagick is not found (on Linux -1 is the error code). Oddly the same issue did not occur on the Ubuntu.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 35.9 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time, before the "use" lines below are resolved:
# insist on a Greenstone installation and put its perllib directory at
# the front of the module search path.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag is normalised to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Option flags; main() hands references to these to parsargv::parse().
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
68
# Print the command-line synopsis on STDERR and terminate.
#
# Takes no arguments and never returns: the script exits with status 1
# once the text has been written.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from this list
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # closing parenthesis was missing in the original text
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
90
# Settings shared by the conversion routines. main() overwrites them from
# the -errlog, -timeout and -verbose command-line options.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
94
# Entry point: parse the command line, deduce the input type and hand the
# file to the matching convertXXX routine.
#
# Arguments: the command-line argument list (options followed by exactly
#            one input filename).
# Output:    the result string returned by the converter, followed by a
#            newline, on STDOUT.
# Exits with status 1 (directly or through print_usage) on bad arguments,
# an unreadable input file or an unsupported input type.
sub main
{
    my (@args) = @_;
    my ($input_type,$output_type,$verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    foreach my $arg (@args) {
        # Options arrive with their leading dash(es) still attached, so the
        # pattern has to allow for them (a bare word is still accepted).
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # The converters treat an empty output type as "try HTML, then TEXT".
    # "auto" is advertised in the usage text but matches neither their html
    # nor their text check, so map it (and undef) to the empty string.
    if (!defined $output_type || $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames. File::Basename is loaded here explicitly rather
    # than relying on another module having pulled it in.
    require File::Basename;
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the (lower-cased) filename extension
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
200
# Run the conversion with the script's command-line arguments.
main(@ARGV);
202
203
204
205	# Document-type conversion functions
206	#
207	# The following functions attempt to convert documents from their
208	# input type to the specified output type. If no output type was
209	# given, then they first attempt HTML, and then TEXT.
210	#
211	# Each returns the output type ("html", "text", or "item" for paged-image
212	# output) or "fail" if no conversion is possible.
213
# Convert a Microsoft Word document.
#
# The file content is inspected first (find_docfile_type), because many
# .doc files are not in fact Word documents. The work is then delegated
# and the delegate's result is returned unchanged.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    # RTF masquerading as .doc
    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Genuine Word 6/7/8 files and docx
    my %is_word = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic treatment
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
231
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML is acceptable (no output type given, or one containing
# "html"), try the native scripting converter if -windows_scripting is
# set and the wvware-based one otherwise. Returns "html" when that works,
# else whatever convertAnything returns.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
251
252
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted. Returns "html" on
# success and "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    return "fail" unless $html_wanted;

    # Attempt specialised conversion to HTML
    my $converted = $windows_scripting
        ? &native_doc_to_html($input_filename, $output_filestem)
        : &rtf_to_html($input_filename, $output_filestem);

    return $converted ? "html" : "fail";
}
279
280
# Convert an unidentified file.
#
# Tries a simple HTML conversion and then a text conversion, each only if
# the requested output type allows it (an empty type allows both).
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
305
306
307
# Convert an Adobe PDF document.
#
# Parameters: the directory of the input file, the input filename, the
# output file stem and the requested output type (may be empty/undef for
# "no preference").
# Returns "item" when a paged-image conversion succeeded, "html" or "text"
# when the respective conversion succeeded, and "fail" otherwise.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # An absent output type means "no preference"; normalise it so the
    # substitution and matches below do not operate on undef.
    $output_type = "" unless defined $output_type;

    # Collapse the first "x-y" sequence to "y"
    # NOTE(review): purpose not evident from this file -- confirm.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
342
343
# Convert an Adobe PostScript document.
#
# Parameters: the directory of the input file, the input filename, the
# output file stem and the requested output type (may be empty/undef for
# "no preference").
# Returns "item" when a paged-image conversion succeeded, "text" when text
# extraction succeeded, and "fail" otherwise. No HTML conversion exists
# for PostScript.
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # An absent output type means "no preference"; normalise it so the
    # substitution and matches below do not operate on undef.
    $output_type = "" unless defined $output_type;

    # Collapse the first "x-y" sequence to "y"
    # NOTE(review): purpose not evident from this file -- confirm.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
368
369
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that mentions neither "html"
# nor "text", the pptextract tool is run to produce images and "item" is
# returned on success (or immediately, if the output directory already
# exists). Otherwise, when HTML is acceptable, ppttohtml.pl is run and
# "html" is returned on success. If neither path returns, any_to_text is
# tried as a last resort: "text" on success, "fail" otherwise.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    #if (!$output_type || $windows_scripting || ($output_type !~ m/html/i) || ($output_type !~ m/text/i)){
    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){
        # pick the pptextract flag for the requested image format
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        }
        elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        }
        elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        $vbScript = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
433
434
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned if it
# exits with status 0. Otherwise any_to_text is tried: "text" on success,
# "fail" if that does not work either.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # fall back to plain text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
468
469
470
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Parameter: the name of the file to inspect.
# Returns:   "docx"    - -windows_scripting is set and the name ends in
#                        .docx (any letter case; the content is not read)
#            "rtf"     - the first line starts with "{\rtf"
#            "word6" / "word7" / "word8"
#                      - some line contains "Word.Document.<6|7|8>"
#            "unknown" - none of the above, or the file cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # case-insensitive, in line with the docx test in native_doc_to_html
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    # Three-argument open so that unusual characters in the filename are
    # never interpreted as an open mode.
    my $chk;
    if (!open($chk, "<", $input_filename)) {
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);

    return $type;
}
511
512
513	# Specific type-to-type conversions
514	#
515	# Each of the following functions attempts to convert a document from
516	# a specific format to another. If they succeed they return 1 and leave
517	# the output document(s) in the appropriate place; if they fail they
518	# return 0 and delete any working files.
519
520
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl, passing the input filename, the output file stem,
# the fail log name, the verbosity and the timeout.
# Returns the exit code of wvware.pl, which the callers use as a boolean
# (presumably wvware.pl exits with 1 on success -- confirm against that
# script). Returns 0 when the command could not be started or was killed
# by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command could not be launched, and sets
    # the low 7 bits when the child died from a signal. Dividing either by
    # 256 gives a non-zero (true) value, which would be taken for success.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    return $raw_status >> 8;
}
535
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows, docx input is handed to docx2html.vbs (run through CScript)
# and everything else to word2html; on other platforms word2html from
# $GSDLHOME/bin/$GSDLOS is used.
#
# Returns 1 if "<stem>.html" already exists, or was created and its first
# line mentions "html"; returns 0 otherwise. Converter error output is
# echoed to STDERR and/or appended to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV {'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    #print STDERR "@@@@@@@@@ cmd=$cmd\n";

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # the original never closed this handle
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml, then checks that "<stem>.html" contains some word
# characters after its <body> line. If a "<stem>_ToC.html" table of
# contents was produced as well, it is spliced into the HTML after the
# header and its links are made relative.
#
# Returns 1 on success. On failure returns 0 after appending the
# converter's error output to the fail log (when one is configured) and
# removing the .err and .html working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; a file that
        # cannot be opened counts as "no content"
        if (open(my $html_in, "<", "$output_filestem.html")) {
            my $past_header=0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful=1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");
        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html","$output_filestem.src");

            # Open both inputs first, and the output only once they are
            # known to be readable. (The original wrote
            # 'open X, "file" || ++$open_failed', where || binds to the
            # string, so a failed open was never noticed.)
            my ($htmlsrc, $toc, $html_out);
            my $src_ok = open($htmlsrc, "<", "$output_filestem.src");
            my $toc_ok = open($toc, "<", "${output_filestem}_ToC.html");
            my $out_ok = ($src_ok && $toc_ok)
                ? open($html_out, ">", "$output_filestem.html")
                : 0;

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the ToC and put the plain HTML back
                close($htmlsrc) if $src_ok;
                close($toc) if $toc_ok;
                &util::mv("$output_filestem.src","$output_filestem.html");
                return 1;
            }

            # print out header info from src html (up to, but excluding,
            # the first line without any word character)
            while (defined(my $line = <$htmlsrc>)) {
                last unless ($line =~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;   # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
753
754
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the zoom factor and the -c / -i / -a / -hidden
# switches derived from the -pdf_* options. Returns 1 when the command
# exited with status 0 and left a non-empty "<stem>.html"; otherwise the
# converter's error output is shown on STDERR, appended to the fail log
# (when one is configured), the working files are removed and 0 is
# returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # only Windows 95/98 gets the single-stream redirection
    if ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
        $cmd .= " > \"$err_file\"";
    }
    else {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }

    $!=0;
    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing pdftohtml.pl" . $reason . "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # failure: discard stdout capture, show and log stderr capture
    &util::rm($out_file) if (-e $out_file);
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }
    #print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
818
# Convert a pdf (or PostScript) file to various types of image with the
# convert command
#
# First probes for ImageMagick by running "identify" through gs-magick.pl
# (skipped on Windows 95/98), then runs pdfpstoimg.pl with the requested
# image type. Returns 1 when pdfpstoimg.pl exits with status 0. Returns 0
# when ImageMagick is missing or the conversion fails; in the latter case
# the converter's error output is shown on STDERR and appended to the fail
# log (when one is configured).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98).
    # Uses the cached $is_winnt_2000 flag, like the rest of this file,
    # instead of calling into Win32 again.
    if (!($ENV{'GSDLOS'} eq "windows" && !$is_winnt_2000)) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # the output is irrelevant; only the exit status in $? matters
        my $identify_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # Collapse the first "x_y" sequence to "y"
    # NOTE(review): purpose not evident from this file -- confirm.
    $output_type =~ s/.\_(.)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $!=0;
    my $retval=system($cmd);
    if ($retval!=0)
    {
        # the script that was run is pdfpstoimg.pl (the message used to
        # name a non-existent "pdftoimg.pl")
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open (my $errlog, "<", "$output_filestem.err") || die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile))
            {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
902
# Convert a PDF file to text with the pdftotext command
#
# Runs pdftotext into "<stem>.text" and checks that the result contains at
# least one word character. Returns 1 when a non-empty text file was
# produced; otherwise shows the converter's error output on STDERR,
# appends it to the fail log (when one is configured), removes the working
# files and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $cmd .= " > \"$err_file\"";
    }
    else {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open (EXTR_TEXT, $text_file) || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...
        my $line = "";
        my $seen_text = 0;
        # stop at the first line with a word character, or as soon as a
        # read yields a false value
        until ($seen_text != 0 || !($line = <EXTR_TEXT>)) {
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close EXTR_TEXT;
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something: tidy up and report success
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftotext error log:\n";
        print STDERR "$_" while (<ERRLOG>);
        close ERRLOG;
    }
    # does this converter create a .out file?
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
968
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# The text is written to "$output_filestem.text".  gs is tried first
# (never on Windows); if it cannot be run, fails, or reports an
# interpreter error, the text is stripped from the PostScript source
# with a set of regexps instead.  Either way the result is then
# re-wrapped at the first whitespace after 72 characters.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path without its extension
# Returns 1 on success.  Returns 0 if the regexp fallback cannot read
# the input, cannot write the output, or the input does not start
# with '%!'.  Dies if the text file cannot be reopened for wrapping.
# Reads the file-level settings $timeout and $faillogfile.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the error log path like the others so that a filestem
        # containing spaces still works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # grab both before any later call can overwrite them
        my ($status, $os_error) = ($?, $!);

        # see man perlfunc - system for this...
        # $? is -1 if the command could not be started, its low byte is
        # set when the command died from a signal, and its high byte is
        # the exit code - so anything non-zero is a failure.  (Testing
        # only $? >> 8 would treat a signal death as success.)
        if ($status != 0) {
            $error = $os_error ? "$os_error" : "couldn't run.\n";
        }
        elsif (!-e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;    # undef if the file is empty
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) { print $faillog $err_line; }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        return 0 unless _ps_strip_text($input_filename, $text_file);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    _wrap_text_file($text_file, 72);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}

# Fallback PostScript-to-text conversion that does not need gs: pulls
# the parenthesised strings out of $input_filename with regexps and
# writes the result to $text_file.  Returns 1 on success, 0 on failure.
#
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_strip_text {
    my ($input_filename, $text_file) = @_;

    print STDERR "Stripping text from postscript\n";

    my $in;
    if (!open($in, '<', $input_filename)) {
        warn "Couldn't read file: $!";
        print STDERR "errors\n";
        return 0;
    }
    # this is for whole .ps file...
    my $text = join('', <$in>);    # see man perlport, under "System Resources"
    close $in;

    # Make sure this is a ps file...  (checked before the output file is
    # created, so a rejected input leaves no empty .text file behind)
    if ($text !~ m/^%!/) {
        print STDERR "Bad postscript header: not '%!'\n";
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "Bad postscript header: not '%!'\n";
            close $faillog;
        }
        return 0;
    }

    my $out;
    if (!open($out, '>', $text_file)) {
        warn "Couldn't write file: $!";
        print STDERR "errors\n";
        return 0;
    }

    # if ps has Page data, then use it to delete all stuff before it.
    # (lazy .*? so that everything up to and including the first
    # %%Page: line is removed)
    $text =~ s/^.*?%%Page:.*?\n//s;    # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g;    # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g;     # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g;     # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g;     # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g;      # asterisk?
    $text =~ s/\\023/\023/g;       # e acute ('e)
    $text =~ s/\\177/\252/g;       # u"
    # $text =~ s/ ?? /\344/g;      # a"

    print $out $text;
    close $out;
    return 1;
}

# Rewrap $text_file in place: each line longer than $wrap_length is
# broken at the first whitespace found at or after $wrap_length
# characters.  Works via a temporary "$text_file.tmp" copy.
# Dies if either file cannot be opened.
sub _wrap_text_file {
    my ($text_file, $wrap_length) = @_;

    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);

    # "or", not "||": with "||" the die attaches to the filename string
    # and can never be reached.
    open(my $in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out, '>', $text_file)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$in>) {
        while (length($line) > 0) {
            # Take at least $wrap_length characters, run on to the end
            # of the current word, then swallow the whitespace after it.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $out "$1\n";
            }
            else {
                # short enough, or nothing could be split off: emit the
                # remainder as it is so the loop always terminates
                print $out $line;
                $line = "";
            }
        }
    }
    close $in;
    close $out;
    &util::rm($tmp_file);
    return;
}
1138
1139
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The strings are first extracted into "$output_filestem.text" by
# any_to_text(); that text is then copied into "$output_filestem.html"
# with markup characters escaped, blank lines turned into <p> and every
# other line prefixed with <br>.  The intermediate text file is removed.
#
# Arguments:
#   $input_filename  - the file to convert
#   $output_filestem - output path without its extension
# Returns 1 on success, 0 if any_to_text() fails or if either the text
# file or the HTML file cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape markup characters; & must go first so that the
        # entities produced for < and > are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        }
        else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1176
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Reads $input_filename in binary mode and writes the printable runs it
# finds to "$output_filestem.text".  Does nothing unless the file-level
# $use_strings setting is true.
#
# Arguments:
#   $input_filename  - the file to extract strings from
#   $output_filestem - output path without its extension
# Returns 1 if at least one chunk of text was written.  Returns 0 if
# $use_strings is off, if either file cannot be opened, or if no text
# was found (in which case the empty output file is removed).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    # three-argument open: the file names are taken literally, whatever
    # characters or surrounding whitespace they contain
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in;    # don't leave the input open on failure
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) {    # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: