Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24371

Last change on this file since 24371 was 24371, checked in by ak19, 13 years ago
Ticket 779: the new wvware.pl script sets the environment for what wvware needs, by setting the LD_LIB_PATH to gnome-lib-minimal in the extension folder, if this exists. wvware.pl is called by gsConvert to run wvware (also checked with the replace src doc with html menu option on rightclick) and the perl script can be launched from the command prompt to do the conversion as well.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 34.9 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time: refuse to start without a Greenstone installation
# and put its perllib directory at the front of the module search path so
# the project modules loaded further down can be found.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use Cwd;
56
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module is probed inside eval: when it cannot be loaded the eval
# yields undef, which is mapped to 0 below.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to parsargv::parse.
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only, $windows_scripting);
68
# Print the command-line help text to STDERR and terminate the script with
# exit status 1. This sub never returns to its caller.
#
# Fix relative to the previous version: the -pdf_zoom description now closes
# the parenthesis it opens.
sub print_usage
{
    my @usage = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n",
    );
    print STDERR @usage;
    exit(1);
}
90
# Shared settings filled in from the command line by main():
#   $faillogfile - file that converter error output is appended to ("" = none)
#   $timeout     - value passed to "ulimit -t" in front of converter commands
#                  (0 = no limit is added)
my ($faillogfile, $timeout) = ("", 0);
93
# Script entry point.
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the input file's directory
# and dispatches to the matching convertXXX routine. The value returned by the
# converter ("html", "text", "item" or "fail") is printed on STDOUT followed
# by a newline.
#
# Arguments: the command-line argument list (normally the global @ARGV).
# Exits with status 1 on bad usage, an unreadable input file, or a missing or
# unsupported input type.
#
# Fixes relative to the previous version:
#  - the -windows_scripting pre-scan now matches the option with its leading
#    dash (the old pattern was anchored directly at the option name);
#  - the argument-count test compares the count, not a boolean;
#  - a missing/empty input type is detected without comparing undef, and the
#    "No filename extension" error is reachable again;
#  - the input path is made absolute before chdir so relative paths survive
#    the directory change;
#  - "-output auto" (and no -output at all) is normalised to "", which is the
#    value the converters treat as "try HTML first, then text";
#  - the ppt/xls type tests are anchored like the doc test.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re  = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@args) {
        # the raw argument still carries its leading dash(es) at this point
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    # NOTE(review): the leading '/' in the errlog spec is reproduced exactly
    # as found; confirm against parsargv's spec syntax.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir below, so a relative path would stop resolving; pin it down now.
    require File::Spec;
    require File::Basename;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # Fall back to the (lower-cased) extension when no -type was given
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # "auto" means "no preference": the converters express that as an empty
    # output type (try HTML, then text).
    if (!defined $output_type || $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname: $!";

    # Select convert utility
    if ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir: $!";
}
197
# Run the conversion with the script's command-line arguments.
main(@ARGV);
199
200
201
202	# Document-type conversion functions
203	#
204	# The following functions attempt to convert documents from their
205	# input type to the specified output type. If no output type was
206	# given, then they first attempt HTML, and then TEXT.
207	#
208	# Each returns the output type ("html", "text", or "item" when paged
209	# images were produced) or "fail" if no conversion is possible.
210
# Convert a file that was presented as a Microsoft Word document.
#
# The name is not trusted: find_docfile_type() inspects the file and the
# work is handed to the Word converter, the RTF converter, or the generic
# fallback accordingly. Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");

    if ($is_word_type{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
228
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# try native_doc_to_html() if -windows_scripting is on, otherwise
# doc_to_html(). If that does not succeed, or HTML was not wanted, fall
# back to convertAnything(). Returns "html" or convertAnything()'s result.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));

    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
248
249
# Convert a Rich Text Format (RTF) file.
#
# Only an HTML conversion is attempted (when no output type was given or it
# contains "html"): native_doc_to_html() with -windows_scripting, otherwise
# rtf_to_html(). Returns "html" on success and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over, so there is
    # deliberately no convertAnything() fallback here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
276
277
# Convert an unidentified file.
#
# Tries any_to_html() when HTML is acceptable, then any_to_text() when text
# is acceptable; an empty output type accepts both. Returns "html", "text"
# or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
302
303
304
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order, conversion to
# page images (type mentions jpg/jpeg-style "jg", gif or png), to HTML, and
# to text. Returns "item" for images, "html", "text", or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # drop a "-" together with the character in front of it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $made_html = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
339
340
# Convert an Adobe PostScript document.
#
# Tries conversion to page images when the output type mentions an image
# format, then conversion to text when text is acceptable. Returns "item",
# "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # drop a "-" together with the character in front of it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &ps_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
365
366
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the external "pptextract" program is run to produce slide images in the
# directory $output_filestem ("item"). Otherwise, when HTML is acceptable,
# ppttohtml.pl is run ("html"). If neither path returns, any_to_text() is the
# last resort ("text"), and "fail" is returned when that fails too.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # image format switch understood by pptextract
        my $ppt_convert_type = ($output_type =~ m/gif/i)  ? "-g"
                             : ($output_type =~ m/jp?g/i) ? "-j"
                             : ($output_type =~ m/png/i)  ? "-p"
                             :                              "";

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
430
431
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run and "html" is returned if it
# exits with status 0. Otherwise (or after a failure) any_to_text() is tried,
# giving "text", and "fail" is returned when that does not succeed either.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
465
466
467
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns:
#   "docx"    - -windows_scripting is on and the name ends in .docx
#               (any letter case); the file is not opened in this case
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7" or "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
#
# Fixes relative to the previous version: the open is checked (a warning is
# printed and "unknown" returned on failure), the handle is lexical and is
# closed on every path, the global $_ is no longer overwritten, and the .docx
# test ignores case like the one in native_doc_to_html().
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to check its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }
    close($chk);

    return $type;
}
508
509
510	# Specific type-to-type conversions
511	#
512	# Each of the following functions attempts to convert a document from
513	# a specific format to another. If they succeed they return 1 and leave
514	# the output document(s) in the appropriate place; if they fail they
515	# return 0 and delete any working files.
516
517
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl (found via "perl -S") with the input filename, output
# filestem, fail-log filename and timeout as its arguments, and returns that
# script's exit status; callers treat a true value as success.
# NOTE(review): this relies on wvware.pl exiting non-zero on success, as the
# previous code did - confirm against wvware.pl.
#
# Fixes relative to the previous version:
#  - the filename arguments are quoted, so paths containing spaces survive
#    the shell and an empty fail-log name no longer lets the timeout slide
#    into its argument position;
#  - a failure to launch (system() returning -1) or death by signal now
#    yields 0 instead of a non-zero fraction that read as success.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"" . &util::get_perl_exec() . "\" -S wvware.pl"
        . " \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);
    if ($raw_status == -1) {
        print STDERR "Error: unable to launch wvware.pl: $!\n";
        return 0;
    }
    if ($raw_status & 127) {
        # low bits set: the child was terminated by a signal
        print STDERR "Error: wvware.pl was terminated by signal " . ($raw_status & 127) . "\n";
        return 0;
    }

    # high byte holds the child's exit status
    return $raw_status >> 8;
}
532
# Attempt to convert a word document to html with the word2html scripting program
#
# Chooses the converter (docx2html.vbs through CScript for docx input on
# Windows, "word2html" otherwise), runs it to produce "$output_filestem.html"
# and checks that the first line of the result mentions "html".
#
# Returns 1 when the HTML file already existed or was produced successfully,
# 0 otherwise. On failure the converter's stderr output is echoed to STDERR
# and/or appended to the fail log (when one was given).
#
# Fixes relative to the previous version: the handle on the .err file is now
# closed after it has been read, every open is checked before the handle is
# read from, and lexical handles with three-argument open are used.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &util::filename_cat($vbScript, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) { print $faillog $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
633
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs "rtftohtml -o <stem>.html <input>" and accepts the result only if the
# HTML file contains some word characters after its <body> line (tags
# ignored). If rtftohtml also wrote "<stem>_ToC.html", that table of contents
# is spliced into the HTML after the header section, with its links made
# relative, and the ToC file is removed.
#
# Returns 1 on success. On failure returns 0 after appending the converter's
# stderr output to the fail log (when one was given) and removing the .err
# and .html files.
#
# Fixes relative to the previous version:
#  - the three opens used for the ToC merge were written as
#    open FH, "name" || ++$open_failed, where || binds to the filename and
#    the failure counter could never be incremented; each open is now tested
#    directly, so the existing "put the original back" branch can be reached;
#  - the first open of the generated HTML is checked;
#  - lexical filehandles replace the barewords and $_ is left untouched.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, "<", "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &util::mv("$output_filestem.html", "$output_filestem.src");

            my $src_ok = open(my $htmlsrc, "<", "$output_filestem.src");
            my $toc_ok = open(my $toc, "<", "${output_filestem}_ToC.html");
            my $out_ok = open(my $html_out, ">", "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # give up on the merge and put the converter's output back
                close($htmlsrc)  if ($src_ok);
                close($toc)      if ($toc_ok);
                close($html_out) if ($out_ok);
                &util::mv("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (up to, and swallowing,
            # the first line without a word character).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print $html_out $header_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;   # ignore first 2 lines
            $skipped = <$toc>;
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print $html_out $list_open if (defined $list_open);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print $html_out $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print $html_out $line;
            }
            close($htmlsrc);
            close($html_out);

            &util::rm("${output_filestem}_ToC.html");
            &util::rm("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) { print $faillog $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    &util::rm("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
750
751
# Convert a pdf file to html with the pdftohtml command
#
# Runs pdftohtml.pl with the zoom factor and the -c / -i / -a / -hidden
# switches selected by the command-line options. The conversion counts as
# successful when the command exits with status 0 and "<stem>.html" is
# non-empty.
#
# Returns 1 on success (temporary .err/.out files removed). On failure the
# converter's stderr output is echoed to STDERR, appended to the fail log
# when one was given, the .out/.html/.err files are removed and 0 is
# returned. $dirname is accepted but not used here.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    # assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";

    my @switches;
    push(@switches, "-c")      if ($pdf_complex);
    push(@switches, "-i")      if ($pdf_ignore_images);
    push(@switches, "-a")      if ($pdf_allow_images_only);
    push(@switches, "-hidden") unless ($pdf_nohidden);
    $cmd .= join("", map { " $_" } @switches);

    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # success: the converter exited cleanly and made something
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # failure: report and clean up
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$err_file") || die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    print STDERR "***********output filestem $output_filestem.html\n";
    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, "$err_file");
            while (my $line = <$errlog>) { print $faillog $line; }
            close($errlog);
            close($faillog);
        }
        &util::rm($err_file);
    }
    return 0;
}
815
# Convert a pdf (or PostScript) file to various types of image with the
# convert command
#
# First checks that ImageMagick's "identify" can be run (skipped on Windows
# versions that are not NT-based), then runs pdfpstoimg.pl with the requested
# image type.
#
# Returns 1 when pdfpstoimg.pl exits with status 0 (temporary .err/.out files
# removed). Otherwise the converter's stderr output is echoed to STDERR,
# appended to the fail log when one was given, the temporary files are
# removed and 0 is returned. $dirname is accepted but not used here.
#
# Fixes relative to the previous version:
#  - the failure message named a non-existent "pdftoimg.pl"; it now names the
#    script that is actually run;
#  - the platform test uses the $is_winnt_2000 flag computed at start-up and
#    the same case-insensitive GSDLOS match as the rest of the file, instead
#    of calling Win32::IsWinNT() directly (which is only defined if the Win32
#    module could be loaded).
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98)
    my $is_old_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000);
    if (!$is_old_windows) {
        my $identify_output = `identify 2>&1`; # only the exit status matters
        # Linux and Windows return different values for "program not found"
        if ($? == -1 || $? == 256) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }

    # drop an "_" together with the character in front of it
    $output_type =~ s/.\_(.)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval == 0) {
        &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        return 1;
    }

    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) { print STDERR ": $!"; }
    print STDERR "\n";

    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");

    # print out the converter's std err, if any
    if (-s "$output_filestem.err") {
        open(my $errlog, "<", "$output_filestem.err") || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) { print $faillog $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
885
# Convert a PDF file to text with the pdftotext command
#
# Runs "pdftotext <input> <stem>.text" and accepts the result only if the
# text file contains at least one word character.
#
# Returns 1 on success. On failure the converter's stderr output is echoed
# to STDERR, appended to the fail log when one was given, the temporary
# files are removed and 0 is returned. $dirname is accepted but not used.
#
# Fixes relative to the previous version:
#  - on non-Windows systems the command redirects stdout to "<stem>.out";
#    that file was only removed on failure and is now removed on success too
#    (as pdf_to_html and pdfps_to_img already do);
#  - when the extracted text cannot be opened, the file is treated as
#    containing no text without reading from an unopened handle.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        if (open(my $extr_text, "<", "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        }
        else {
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (!-s "$output_filestem.text") {
        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") || die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
        &util::rm("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) { print $faillog $line; }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm("$output_filestem.err");
        }
        return 0;
    }

    &util::rm("$output_filestem.err") if (-e "$output_filestem.err");
    &util::rm("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
951
952	# Convert a PostScript document to text
953	# note - just using "ps2ascii" isn't good enough, as it
954	# returns 0 for a postscript interpreter error. ps2ascii is just
955	# a wrapper to "gs" anyway, so we use that cmd here.
956
sub ps_to_text {
    # Convert the PostScript file $input_filename to plain text, written to
    # "$output_filestem.text".
    #
    # Unless GSDLOS says we are on Windows, "gs" is run with ps2ascii.ps
    # (prefixed by "ulimit -t $timeout" when $timeout is set).  If gs exits
    # non-zero, produces no output file, or its output starts with
    # "Error: ", the problem is appended to $faillogfile (when set) and the
    # text is instead scraped from the PostScript source with a series of
    # regular expressions.  Either way the resulting text is re-wrapped.
    #
    # Returns 1 on success.  Returns 0 if the regexp fallback cannot read
    # the input, cannot write the output, or finds no "%!" header.  Dies if
    # the files used for the final wrapping step cannot be opened.
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the stderr target too, like the other paths on the command line
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_out, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$ps_out>;
            # $first_line is undef when gs produced an empty file
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $ps_out;
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $fail_log, '>>', $faillogfile)) {
            print {$fail_log} "gs - $error\n";
            # copy whatever gs wrote on stderr into the fail log
            if (-e $err_file && open(my $err_log, '<', $err_file)) {
                while (my $err_line = <$err_log>) {
                    print {$fail_log} $err_line;
                }
                close $err_log;
            }
            close $fail_log;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $ps_in;
        unless (open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$ps_in>);
        close $ps_in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $fail_log, '>>', $faillogfile)) {
                print {$fail_log} "Bad postscript header: not '%!'\n";
                close $fail_log;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it
        # (everything up to and including the first %%Page: line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        # Only create the output file once there is something to put in it,
        # so a failure above never leaves an empty .text file behind.
        my $stripped_out;
        unless (open($stripped_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$stripped_out} $text;
        close $stripped_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);
    # "or" (not "||") so that the die applies to open() rather than to the
    # always-true filename string
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            # Cut off at least $wrap_length characters plus the rest of the
            # current word, and swallow the whitespace that follows.  If the
            # substitution somehow fails, emit the remainder as-is instead
            # of looping forever.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrap_out} "$1\n";
            } else {
                print {$wrap_out} $line;
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1121
1122
1123	# Convert any file to HTML with a crude perl implementation of the
1124	# UNIX strings command.
1125
sub any_to_html {
    # Convert $input_filename to "$output_filestem.html" by first running
    # any_to_text and then wrapping the resulting text in minimal HTML:
    # blank lines become <p>, every other line is prefixed with <br>.
    # The intermediate "$output_filestem.text" file is removed afterwards.
    #
    # Returns 1 on success, 0 if any_to_text fails or if the text/HTML
    # files cannot be opened.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    unless (open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    unless (open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # Escape HTML metacharacters so the extracted text cannot be read
        # as markup; '&' must be done first so the entities produced for
        # '<' and '>' are not escaped again.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1159
1160	# Convert any file to TEXT with a crude perl implementation of the
1161	# UNIX strings command.
1162	# Note - this assumes ascii charsets :( (jrm21)
1163
sub any_to_text {
    # Extract runs of printable ASCII from $input_filename (read in binary
    # mode) and write them to "$output_filestem.text", dropping any run
    # shorter than 10 characters.
    #
    # Returns 0 immediately when $use_strings is false, when either file
    # cannot be opened, or when nothing worth keeping was found (in which
    # case the output file is removed).  Returns 1 otherwise.
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    # Three-argument open: the file names are taken literally, whatever
    # whitespace or special characters they contain.
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    unless (open($out, '>', "$output_filestem.text")) {
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: