Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24375

Last change on this file since 24375 was 24375, checked in by ak19, 13 years ago
Added in verbosity option when launching wvware.pl, so that an unnecessary message can be suppressed at lower verbosity levels.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 35.0 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Refuse to start without GSDLHOME, then put $GSDLHOME/perllib at the front
# of @INC so that the "use" statements below can find their modules.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?
# The probe runs inside eval: if Win32 cannot be loaded the eval yields
# undef, which is then normalised to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Option flags, filled in by parsargv::parse() in main() and read by the
# conversion functions below.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
68
# Print the usage summary to STDERR and terminate the program.
#
# Takes no arguments. Never returns: always calls exit(1).
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from this summary
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # closing parenthesis was missing in the original text
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
90
# Settings shared by the conversion functions: the error log to append to
# ("" = none), the cpu-second limit (0 = none) and the verbosity level.
# main() overwrites them from the command line.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
94
# Program entry point.
#
# Parses the command-line arguments passed in @_, works out the input type
# (from -type, or else from the filename extension), changes into the input
# file's directory and dispatches to the matching convertXXX function. The
# converter's result string is printed on STDOUT followed by a newline.
#
# Calls print_usage() (which exits) on bad arguments, and exits with status 1
# when the input file is unreadable or its type cannot be handled.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The option arrives with its leading dash(es) still attached, so the
    # pattern has to allow for them (the bare word is still accepted).
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one argument (the input file) must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit type: fall back to the extension (without its dot). A file
    # with no extension leaves the type undefined so that the dedicated
    # error message below is actually reached.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
200
# Run the conversion on the command-line arguments.
main(@ARGV);
202
203
204
205	# Document-type conversion functions
206	#
207	# The following functions attempt to convert documents from their
208	# input type to the specified output type. If no output type was
209	# given, then they first attempt HTML, and then TEXT.
210	#
211	# Each returns the output type ("html" or "text") or "fail" if no
212	# conversion is possible.
213
214	# Convert a Microsoft word document
215
# Convert a file that claims to be a Word document.
#
# The real content type is sniffed first, because many .doc files are not in
# fact word documents; the file is then handed to the Word, RTF or generic
# converter. Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word_type{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    # anything else gets the generic treatment
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
231
232	# Convert a Microsoft word 6/7/8 document
233
# Convert a Word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# try the Windows scripting converter if -windows_scripting is set, otherwise
# the wv-based one. Returns "html" if that works; in every other case the
# result of convertAnything() is returned.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
251
252
253	# Convert a Rich Text Format (RTF) file
254
# Convert a Rich Text Format (RTF) file.
#
# Only HTML output is attempted (when no output type is given, or it
# contains "html"): via Windows scripting if -windows_scripting is set,
# otherwise via rtftohtml. Returns "html" on success and "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # There is deliberately no fall-back to convertAnything() here: rtf is
    # so ugly that it's not worth running strings over it.
    return "fail";
}
279
280
281	# Convert an unidentified file
282
# Convert an unidentified file.
#
# Tries the generic HTML converter and then the generic text converter,
# each only if the requested output type allows it (an empty output type
# allows both). Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if ((!$output_type || ($output_type =~ m/html/i))
        && &any_to_html($input_filename, $output_filestem)) {
        return "html";
    }

    # Convert to text
    if ((!$output_type || ($output_type =~ m/text/i))
        && &any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
305
306
307
308	# Convert an Adobe PDF document
309
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images
# (type mentions jpg/jpeg/gif/png), HTML, then plain text. Returns "item"
# for images, "html", "text", or "fail" when nothing worked.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # normalise the output type before testing it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to HTML
    if ((!$output_type || ($output_type =~ m/html/i))
        && &pdf_to_html($dirname, $input_filename, $output_filestem)) {
        return "html";
    }

    # Attempt conversion to TEXT
    if ((!$output_type || ($output_type =~ m/text/i))
        && &pdf_to_text($dirname, $input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
342
343
344	# Convert an Adobe PostScript document
345
# Convert an Adobe PostScript document.
#
# Tries page images when the output type mentions jpg/jpeg/gif/png, then
# plain text when the output type is empty or contains "text". Returns
# "item", "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # normalise the output type before testing it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i
        && &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
        return "item";
    }

    # Attempt conversion to TEXT
    if ((!$output_type || ($output_type =~ m/text/i))
        && &ps_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
368
369
# Convert a PowerPoint presentation.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract tool is run to produce images ("item"). Otherwise, when HTML
# is acceptable, ppttohtml.pl is run ("html"). If the chosen route fails or
# none applies, any_to_text() is tried ("text"); failing that, "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html|text/i)) {
        # pick the image flag for pptextract; the first matching format wins
        my $image_flag = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $flag) = @$choice;
            if ($output_type =~ m/$pattern/i) {
                $image_flag = $flag;
                last;
            }
        }

        my $extractor = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                            $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $extractor = "pptextract";
        }

        my $command = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $command .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }
        return "item" if (system($command) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $command = "\"$perl_exec\" -S ppttohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($command) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
433
434
# Convert an Excel spreadsheet.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" returned on
# success. Otherwise, or if that fails, any_to_text() is tried ("text");
# failing that, "fail" is returned.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $command = "\"$perl_exec\" -S xlstohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($command) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
468
469
470
471	# Find the real type of a .doc file
472	#
473	# We seem to have a lot of files with a .doc extension that are .rtf
474	# files or Word 5 files. This function attempts to tell the difference.
# Work out what a file with a Word extension really contains.
#
# Returns:
#   "docx"    - -windows_scripting is set and the name ends in .docx
#               (any case, matching the test in native_doc_to_html)
#   "rtf"     - the first line starts with "{\rtf"
#   "wordN"   - some line contains "Word.Document.N", N being 6, 7 or 8
#   "unknown" - none of the above, or the file could not be opened
#
# The file handle is lexical and is closed on every path; $_ is not touched.
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }
    close($chk);

    return $type;
}
511
512
513	# Specific type-to-type conversions
514	#
515	# Each of the following functions attempts to convert a document from
516	# a specific format to another. If they succeed they return 1 and leave
517	# the output document(s) in the appropriate place; if they fail they
518	# return 0 and delete any working files.
519
520
521	# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html by launching wvware.pl.
#
# Arguments: the input filename and the output file stem.
# Returns the exit code of wvware.pl, which callers treat as a boolean
# (non-zero = converted). Returns 0 if the script could not be launched or
# was terminated by a signal; previously those cases produced a non-zero
# fraction and so were mistaken for success.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # The path to perl and the file names are all quoted, in case there are
    # spaces in them.
    my $launch_cmd = "\"" . &util::get_perl_exec() . "\" -S wvware.pl"
        . " \"$input_filename\" \"$output_filestem\"";

    # NOTE(review): an empty error-log name is left out altogether, exactly
    # as the unquoted original effectively did, so the positional arguments
    # seen by wvware.pl are unchanged - confirm against wvware.pl whether an
    # explicit "" placeholder would be more appropriate.
    $launch_cmd .= " \"$faillogfile\"" if ($faillogfile ne "");
    $launch_cmd .= " $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $wait_status = system($launch_cmd);

    # -1: could not be started; low 7 bits set: killed by a signal
    if ($wait_status == -1 || ($wait_status & 127)) {
        return 0;
    }
    return $wait_status >> 8;
}
535
536	# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with a Windows scripting tool
# (docx2html.vbs for docx input on Windows, word2html otherwise).
#
# Arguments: the input filename and the output file stem; the result is
# written to "<stem>.html".
# Returns 1 if "<stem>.html" already exists or was produced and its first
# line mentions "html"; returns 0 otherwise. On failure the tool's error
# output is appended to the fail log, if one was given.
#
# All file handles are lexical and are closed before returning.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR; the //Nologo
            # flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions: use the usual VB executable word2html.
            # Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &util::filename_cat($vbScript, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # the fail log is optional; carry on without it if it can't be opened
            my $faillog_fh;
            if ($faillogfile ne "") {
                open($faillog_fh, '>>', $faillogfile) or undef $faillog_fh;
            }

            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog_fh "$line" if ($faillog_fh);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog_fh " (given an invalid .DOC file?)\n"
                        if ($faillog_fh);
                }
                # this handle was previously left open
                close($err_fh);
            }
            close($faillog_fh) if ($faillog_fh);
        }
        # the .err file is left in place here, as before
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    print $faillog_fh $line;
                }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
636
637	# Attempt to convert an RTF document to html with rtftohtml
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments: the input filename and the output file stem; the result is
# written to "<stem>.html".
# Returns 1 if the generated HTML has some word content after its <body>
# line, merging in the "<stem>_ToC.html" table of contents when one was
# produced. Otherwise returns 0, after appending any converter error output
# to the fail log (if one was given) and removing the .err and .html files.
#
# The three opens used for the table-of-contents merge are now really
# checked: the previous 'open H, "file" || ++$failed' form bound || to the
# file name, so a failed open was never noticed.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header
        if (open(my $html_in, '<', "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                # strip tags; any word character left over is real content
                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) {
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            my $src_ok = open(my $src_fh, '<', "$output_filestem.src");
            my $toc_ok = open(my $toc_fh, '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $out_fh, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the merge and put the plain HTML back
                close($src_fh) if ($src_ok);
                close($toc_fh) if ($toc_ok);
                close($out_fh) if ($out_ok);
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped)
            while (defined(my $header_line = <$src_fh>)) {
                last if ($header_line !~ m/\w/);
                print $out_fh $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc_fh>;    # ignore first 2 lines
            $ignored = <$toc_fh>;
            my $list_start = <$toc_fh>; # line 3 = "<ol>\n"
            print $out_fh $list_start if (defined $list_start);
            while (my $line = <$toc_fh>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print $out_fh $line;
            }
            close($toc_fh);

            # rest of html src
            while (my $line = <$src_fh>) {
                print $out_fh $line;
            }
            close($src_fh);
            close($out_fh);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile))
        {
            print $faillog_fh "Error - rtftohtml - couldn't extract text\n";
            print $faillog_fh " (rtf file might be too recent):\n";
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    print $faillog_fh $line;
                }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
753
754
755	# Convert a pdf file to html with the pdftohtml command
756
# Convert a pdf file to html by running pdftohtml.pl.
#
# Arguments: directory name (unused here), input filename, output file stem.
# The -pdf_* command-line options select the flags passed to the script.
# Returns 1 if the script exited with status 0 and "<stem>.html" is
# non-empty; otherwise reports the converter's error output, appends it to
# the fail log (if any), removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # formulate the command
    my $command = $timeout ? "ulimit -t $timeout;" : "";
    my $perl_exec = &util::get_perl_exec();
    $command .= "\"$perl_exec\" -S pdftohtml.pl -zoom $pdf_zoom";

    my @flags;
    push(@flags, " -c") if ($pdf_complex);
    push(@flags, " -i") if ($pdf_ignore_images);
    push(@flags, " -a") if ($pdf_allow_images_only);
    push(@flags, " -hidden") if (!$pdf_nohidden);
    $command .= join("", @flags);
    $command .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $command .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $command .= " > \"$err_file\"";
    }

    # execute the command
    $! = 0;
    my $status = system($command);
    if ($status != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter ran cleanly and made something: tidy up and report success
    if ($status == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $err_fh, "$output_filestem.err") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$err_fh>) {
            print STDERR "$line";
        }
        close($err_fh);
    }
    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, ">>$faillogfile")) {
            open(my $err_fh, "$output_filestem.err");
            while (my $line = <$err_fh>) {
                print $faillog_fh $line;
            }
            close($err_fh);
            close($faillog_fh);
        }
        &util::rm($err_file);
    }
    return 0;
}
818
819	# Convert a pdf file to various types of image with the convert command
820
# Convert a pdf or postscript file to page images by running pdfpstoimg.pl
# (which relies on ImageMagick).
#
# Arguments: directory name (unused here), input filename, output file stem
# and the requested output type.
# Returns 1 if the script exited with status 0. Returns 0 if ImageMagick's
# "identify" cannot be run, or if the script failed; in the latter case the
# script's error output is echoed, appended to the fail log (if any) and the
# .out/.err working files are removed.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except
    # for Windows 95/98). The file-level $is_winnt_2000 flag is used rather
    # than calling Win32::IsWinNT() directly, so this cannot die if Win32
    # failed to load.
    if ($ENV{'GSDLOS'} ne "windows" || $is_winnt_2000) {
        my $probe_output = `identify 2>&1`; # output is not needed, only $?
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.\_(.)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # execute the command
    $! = 0;
    my $retval = system($cmd);

    if ($retval == 0) {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    # the message now names the script that was actually run
    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) {print STDERR ": $!";}
    print STDERR "\n";

    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $err_fh, '<', "$output_filestem.err") or die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $line = <$err_fh>) {
            print STDERR "$line";
        }
        close($err_fh);
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile))
        {
            if (open(my $err_fh, '<', "$output_filestem.err")) {
                while (my $line = <$err_fh>) {
                    print $faillog_fh $line;
                }
                close($err_fh);
            }
            close($faillog_fh);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
888
889	# Convert a PDF file to text with the pdftotext command
890
# Convert a PDF file to text with the pdftotext command.
#
# Arguments: directory name (unused here), input filename, output file stem;
# the text is written to "<stem>.text".
# Returns 1 if that file ends up non-empty and contains at least one word
# character. Otherwise echoes the converter's error output, appends it to
# the fail log (if any), removes the working files and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
    $cmd .= ($ENV{'GSDLOS'} !~ m/^windows$/i)
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open(my $text_fh, "$output_filestem.text") || warn "open: $!";
        binmode($text_fh); # just in case...
        my $line = "";
        my $has_text = 0;
        while (!$has_text && ($line = <$text_fh>)) {
            $has_text = 1 if ($line =~ m/\w/);
        }
        close($text_fh);
        if (!$has_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something usable: tidy up and report success
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open(my $err_fh, "$output_filestem.err") || die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $err_line = <$err_fh>) {
            print STDERR "$err_line";
        }
        close($err_fh);
    }

    # does this converter create a .out file?
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog_fh, ">>$faillogfile")) {
            open(my $err_fh, "$output_filestem.err");
            while (my $err_line = <$err_fh>) {
                print $faillog_fh $err_line;
            }
            close($err_fh);
            close($faillog_fh);
        }
        &util::rm($err_file);
    }
    return 0;
}
954
955	# Convert a PostScript document to text
956	# note - just using "ps2ascii" isn't good enough, as it
957	# returns 0 for a postscript interpreter error. ps2ascii is just
958	# a wrapper to "gs" anyway, so we use that cmd here.
959
sub ps_to_text {
    # Arguments:
    #   $input_filename  - the PostScript file to convert
    #   $output_filestem - output path without extension; the result is
    #                      written to "$output_filestem.text" and the stderr
    #                      of gs is captured in "$output_filestem.err"
    #
    # First tries ghostscript (gs with ps2ascii.ps).  If that is unavailable
    # or fails, falls back to a crude regexp-based extraction of the strings
    # in the PostScript source.  Whatever text was produced is then
    # re-wrapped at roughly $wrap_length columns.
    #
    # Returns 1 on success, 0 if the fallback could not read the input,
    # could not create the output, or the input has no '%!' header.
    # Dies if the temporary file used for wrapping cannot be opened.
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error     = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the stderr target like the other paths, so that a filestem
        # containing spaces does not break the redirection
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8;    # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            if ($!) { $error = $!; }
            else    { $error = "couldn't run.\n"; }
        }
        elsif (!-e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                close $psout;
                # $first_line is undef when gs produced an empty file
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        # append the failure reason and anything gs wrote to stderr to the
        # fail log, if one was requested
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $opened_in = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" unless $opened_in;
        my $opened_out = open(my $out, '>', $text_file);
        warn "Couldn't write file: $!" unless $opened_out;
        if (!$opened_in || !$opened_out) {
            # don't leave the handle that did open dangling
            close $in  if $opened_in;
            close $out if $opened_out;
            print STDERR "errors\n";
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$in>);  # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            close $out;
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        # (non-greedy ".*?" so that only the prologue up to and including
        # the first %%Page: line is dropped)
        $text =~ s/^.*?%%Page:.*?\n//s;    # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g;      # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g;       # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g;       # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g;       # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g;          # asterisk?
        $text =~ s/\\023/\023/g;           # e acute ('e)
        $text =~ s/\\177/\252/g;           # u"
        # $text =~ s/ ?? /\344/g;          # a"

        print {$out} $text;
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file    = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or" rather than "||": "||" would bind to the file name string and the
    # die could never trigger
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Peel off at least $wrap_length characters plus the remainder of
            # the word in progress, and swallow the whitespace that follows.
            # If the pattern cannot match, emit the rest unchanged so the
            # loop is guaranteed to terminate.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//)
            {
                print {$outfile} "$1\n";
            }
            else {
                print {$outfile} $line;
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1124
1125
1126	# Convert any file to HTML with a crude perl implementation of the
1127	# UNIX strings command.
1128
sub any_to_html {
    # Arguments:
    #   $input_filename  - the file to extract printable strings from
    #   $output_filestem - output path without extension; the result is
    #                      written to "$output_filestem.html"
    #
    # Uses any_to_text to produce "$output_filestem.text", wraps that text in
    # a minimal HTML page and then removes the intermediate text file.
    #
    # Returns 1 on success, 0 if any_to_text failed or if either the text
    # file or the HTML file could not be opened.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        # don't leave the intermediate file behind on failure
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape the characters that are significant in HTML; '&' must be
        # handled first so the entities added below are not escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            # a blank line starts a new paragraph
            print {$html_fh} "<p>";
        }
        else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1162
1163	# Convert any file to TEXT with a crude perl implementation of the
1164	# UNIX strings command.
1165	# Note - this assumes ascii charsets :( (jrm21)
1166
sub any_to_text {
    # Arguments:
    #   $input_filename  - the file to extract printable strings from
    #   $output_filestem - output path without extension; the result is
    #                      written to "$output_filestem.text"
    #
    # Returns 1 if at least one chunk of text was written.  Returns 0 if
    # $use_strings is not set, if either file could not be opened, or if
    # nothing printable was found (the empty output file is then removed).
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    my $text_file = "$output_filestem.text";

    # three-argument open, so that unusual characters in the file names are
    # never interpreted as part of the open mode
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', $text_file)) {
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) {    # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: