Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 30724

Last change on this file since 30724 was 30724, checked in by ak19, 8 years ago
Extra, informative message that Dr Bainbridge added to indicate it's not an error (but normal behaviour on Windows) for ghostscript/gs to not work on Windows.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 36.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Put $GSDLHOME/perllib on the module search path before the "use" lines
# that follow are compiled (presumably that is where parsargv, util and
# FileUtils live -- TODO confirm). Aborts if GSDLHOME is not set.
BEGIN {
    defined $ENV{'GSDLHOME'} or die "GSDLHOME not set\n";
    unshift @INC, "$ENV{'GSDLHOME'}/perllib";
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# mapped to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT() };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() passes references to these to
# parsargv::parse().
my ($use_strings, $windows_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
69
# Write the command-line synopsis to STDERR and terminate the script with
# exit status 1. Never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
91
# Settings shared with the conversion routines: the file that converter
# error output is appended to ("" means none), the CPU-time limit in
# seconds handed to ulimit (0 means none) and the verbosity level.
my $faillogfile="";
my $timeout=0;
my $verbosity=0;

# Script entry point.
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the input file's
# directory and hands the file to the matching convertXXX routine. The
# value that routine returns is printed on STDOUT followed by a newline.
#
# Arguments: the command-line argument list.
# Exits with status 1 (after a message on STDERR) on a usage error, an
# unreadable input file or an unsupported input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@args) {
        # Options arrive with their leading dash, so the dash has to be
        # allowed for here; the bare spelling is still accepted as before.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with a '/', unlike its siblings;
    # kept as found -- confirm that parsargv accepts this spelling.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity=$verbose if defined $verbose;

    # Exactly one argument -- the input file -- must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back to the extension without its leading dot.
    # A file without an extension leaves the type undefined so that the
    # dedicated error below is reported.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (defined $suffix && $suffix ne "")
            ? lc(substr($suffix, 1))
            : undef;
    }
    if (!defined $input_type || $input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type), "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type), "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type), "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type), "\n";
    }
    elsif ($input_type =~ m/pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type), "\n";
    }
    elsif ($input_type =~ m/xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type), "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
203
204
205
206	# Document-type conversion functions
207	#
208	# The following functions attempt to convert documents from their
209	# input type to the specified output type. If no output type was
210	# given, then they first attempt HTML, and then TEXT.
211	#
212	# Each returns the output type ("html", "text", or "item" when the result
213	# is a set of images) or "fail" if no conversion is possible.
214
# Convert a Microsoft Word document.
#
# The file content is sniffed first, because many .doc files are not in
# fact Word documents; the work is then delegated to the converter for the
# detected format. Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    return &convertRTF($input_filename, $output_filestem, $output_type)
        if ($detected eq "rtf");

    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    return &convertWord678($input_filename, $output_filestem, $output_type)
        if ($is_word_type{$detected});

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft Word 6/7/8 (or, with windows scripting, docx) document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# tries the windows-scripting converter if -windows_scripting is set and the
# wvware-based converter otherwise. Returns "html" if that works, else the
# result of convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file.
#
# Only an HTML conversion is attempted (when no output type was given, or
# one containing "html"). Returns "html" on success and "fail" otherwise:
# rtf is considered too ugly to be worth running strings over, so there is
# deliberately no fallback to convertAnything().
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    return "fail" unless ($html_wanted);

    # Attempt specialised conversion to HTML
    my $converted = $windows_scripting
        ? &native_doc_to_html($input_filename, $output_filestem)
        : &rtf_to_html($input_filename, $output_filestem);

    return $converted ? "html" : "fail";
}
280
281
# Convert an unidentified file.
#
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type allows it (no output type allows
# both). Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order, conversion
# to images, to HTML and to text. Returns "item" (images), "html", "text"
# or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops the first "<char>-" pair found in the output type, keeping the
    # character after the dash.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (&pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
343
344
# Convert an Adobe PostScript document.
#
# Tries conversion to images when an image output type was requested, then
# conversion to text when text is acceptable. Returns "item" (images),
# "text" or "fail".
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # Drops the first "<char>-" pair found in the output type, keeping the
    # character after the dash.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
369
370
# Convert a Microsoft PowerPoint presentation.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run to produce images ("item"). Otherwise, when
# HTML is acceptable, ppttohtml.pl is run ("html"). If the chosen converter
# fails (or neither applies) the generic text extraction is tried, giving
# "text" or finally "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)){
        # Map the requested image format onto the flag handed to pptextract;
        # the first matching entry wins.
        my $ppt_convert_type = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($format_re, $flag) = @$choice;
            if ($output_type =~ m/$format_re/i) {
                $ppt_convert_type = $flag;
                last;
            }
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # now we use the .vbs VBScript
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        # stderr is redirected everywhere except on pre-NT Windows
        unless ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        unless ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
436
437
# Convert a Microsoft Excel spreadsheet.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# xlstohtml.pl is run and "html" is returned if it exits successfully.
# Otherwise the generic text extraction is tried, giving "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is redirected everywhere except on pre-NT Windows
        unless ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $!=0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
471
472
473
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns one of:
#   "docx"    - windows scripting is enabled and the name ends in .docx
#               (any letter case); the file content is not read
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # Case-insensitive, to agree with the docx test in native_doc_to_html
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
514
515
516	# Specific type-to-type conversions
517	#
518	# Each of the following functions attempts to convert a document from
519	# a specific format to another. If they succeed they return 1 and leave
520	# the output document(s) in the appropriate place; if they fail they
521	# return 0 and delete any working files.
522
523
# Attempt to convert a word document to html with the wv program.
#
# Launches wvware.pl through the perl executable reported by
# util::get_perl_exec(), passing it the input file, the output filestem, the
# fail log file, the verbosity and the timeout. Returns the value of
# system() divided by 256; callers treat a true value as success.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $perl_exec = &util::get_perl_exec();
    my @cmd_parts = (
        "\"$perl_exec\"",
        "-S",
        "wvware.pl",
        "\"$input_filename\"",
        "\"$output_filestem\"",
        "\"$faillogfile\"",
        $verbosity,
        $timeout,
    );
    my $launch_cmd = join(" ", @cmd_parts);

    return system($launch_cmd)/256;
}
538
# Attempt to convert a word document to html with the word2html scripting
# program (or, on Windows with a docx input, the docx2html.vbs script).
#
# Arguments: the input file and the output filestem; the result is written
# to "<filestem>.html" and the converter's stderr to "<filestem>.err" where
# redirection is used.
# Returns 1 if the html file already existed or was produced and its first
# line mentions "html"; returns 0 otherwise. Converter error output is
# echoed to STDERR and/or appended to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR; the //Nologo
            # flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$html_file\"";

    # redirecting STDERR
    $cmd .= " 2> \"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file && open(my $errfile, '<', $err_file)) {
            # the fail log is optional; $faillog stays undef without one
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($faillog);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n" if ($faillog);
            }

            close($faillog) if ($faillog);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
639
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments: the input file and the output filestem; the result is written
# to "<filestem>.html".
# The conversion counts as successful when the html file contains at least
# one word character after its <body> line. If rtftohtml also produced a
# "<filestem>_ToC.html" table of contents, it is merged into the html file
# (with its links made relative) and then deleted.
# Returns 1 on success. On failure the converter's error output is appended
# to the fail log (when one is configured), the working files are deleted
# and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$html_file\" \"$input_filename\"";
    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful=0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header=0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header=1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful=1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &FileUtils::moveFiles($html_file, $src_file);

            # "or" rather than "||": the failure counter must apply to the
            # open() call itself, not to the filename argument.
            my ($htmlsrc, $toc, $html_out);
            my $open_failed=0;
            open($htmlsrc, '<', $src_file) or ++$open_failed;
            open($toc, '<', $toc_file) or ++$open_failed;
            open($html_out, '>', $html_file) or ++$open_failed;

            if ($open_failed) {
                # give up on the merge and put the plain html back
                foreach my $fh ($htmlsrc, $toc, $html_out) {
                    close($fh) if (defined $fh && defined fileno($fh));
                }
                &FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;  # ignore first 2 lines
            $skipped = <$toc>;
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_open if (defined $list_open);
            while (my $toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $src_line = <$htmlsrc>) {
                print {$html_out} $src_line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles($toc_file);
            &FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
756
757
# Convert a pdf file to html with the pdftohtml command
#
# Arguments: the working directory (not used here), the input file and the
# output filestem; the result is expected in "<filestem>.html".
# Runs pdftohtml.pl with the options selected on the command line (-zoom,
# -c, -i, -a, -hidden). Returns 1 when the command exits successfully and
# left a non-empty html file. Otherwise the converter's error output is
# echoed to STDERR and appended to the fail log (when one is configured),
# the working files are deleted and 0 is returned.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # Without separate stderr redirection (pre-NT Windows) stdout itself
    # goes to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $!=0;

    my $retval=system($cmd);
    if ($retval!=0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            if (open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
821
# Convert a pdf (or PostScript) file to various types of image with the
# convert command
#
# Arguments: the working directory (not used here), the input file, the
# output filestem and the requested output type, which is passed on to
# pdfpstoimg.pl via -convert_to.
# Returns 0 straight away when the ImageMagick probe reports "program not
# found". Otherwise runs pdfpstoimg.pl and returns 1 if it exits
# successfully; on failure the converter's error output is echoed to STDERR
# and appended to the fail log (when one is configured), the working files
# are deleted and 0 is returned.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except
    # for Windows 95/98)
    unless ($ENV{'GSDLOS'} eq "windows" && !$is_winnt_2000) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # only the exit status in $? is of interest, not the output
        my $ignored_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once
        # they're converted to signed values, it will be -1 for Linux and 1
        # for Windows. Whenever we test for return values other than 0,
        # shift by 8 and perform unsigned to signed status conversion on $?
        # to get expected range of return vals. Although gs-magick.pl
        # already shifts its $? by 8, converts it to a signed value and then
        # exits on that, by the time we get here, we need to do it again.
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not
            # available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # Drops the first "<char>_" pair found in the output type, keeping the
    # character after the underscore.
    $output_type =~ s/.\_(.)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    # Without separate stderr redirection (pre-NT Windows) stdout itself
    # goes to the .err file.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $!=0;
    my $retval=system($cmd);

    if ($retval == 0) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) {print STDERR ": $!";}
    print STDERR "\n";

    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close($errlog);
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            if (open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
905
# Convert a PDF file to text with the pdftotext command
#
# Arguments: the working directory (not used here), the input file and the
# output filestem; the result is written to "<filestem>.text".
# Returns 1 when the text file exists, is non-empty and contains at least
# one word character. Otherwise the converter's error output is echoed to
# STDERR and appended to the fail log (when one is configured), the working
# files are deleted and 0 is returned.
#
# NOTE(review): unlike the other converters, the redirection test here does
# not consult $is_winnt_2000 and no ulimit timeout is applied -- confirm
# whether that is intentional.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd)!=0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text=0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ m/\w/) {
                    $seen_text=1;
                    last;
                }
            }
            close($extr_text);
        } else {
            # an unreadable file is treated like one without any text
            warn "open: $!";
        }
        if ($seen_text==0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (-s $text_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        return 1;
    }

    # print out the converters std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdftotext error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR "$err_line";
        }
        close($errlog);
    }

    # does this converter create a .out file?
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    &FileUtils::removeFiles($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            if (open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
971
972	# Convert a PostScript document to text
973	# note - just using "ps2ascii" isn't good enough, as it
974	# returns 0 for a postscript interpreter error. ps2ascii is just
975	# a wrapper to "gs" anyway, so we use that cmd here.
976
sub ps_to_text {
    # Convert a PostScript document to plain text in "$output_filestem.text".
    #
    # Ghostscript (gs running ps2ascii.ps) is tried first. If gs is not
    # attempted (Windows) or reports a failure, the text is extracted with a
    # crude set of regular expressions instead. Whatever text was produced is
    # then re-wrapped so that lines break at the first whitespace after a
    # minimum length.
    #
    # Parameters:
    #   $input_filename  - path of the PostScript file to convert
    #   $output_filestem - output path without extension; "$stem.text",
    #                      "$stem.err" and "$stem.text.tmp" are derived from it
    #
    # Uses the file-level variables $timeout and $faillogfile.
    #
    # Returns 1 on success. Returns 0 when the regular-expression fallback
    # cannot read the input, cannot write the output, or the input does not
    # start with "%!". Dies if the files needed for re-wrapping cannot be
    # opened.
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = _ps_to_text_run_gs($input_filename, $text_file, $err_file);

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        # Record the gs failure (and anything gs wrote to stderr) in the
        # fail log, when one is configured.
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        return 0 unless _ps_to_text_regex_fallback($input_filename, $text_file);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    _ps_to_text_wrap_file($text_file, 72);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}

# Run ghostscript's ps2ascii on $input_filename, writing stdout to $text_file
# and stderr to $err_file. Returns "" on success, otherwise a description of
# what went wrong (gs is never attempted when GSDLOS is "windows").
sub _ps_to_text_run_gs {
    my ($input_filename, $text_file, $err_file) = @_;

    # if we're on windows we fall straight through without attempting gs
    if (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ m/^windows$/i) {
        return "Windows does not support gs";
    }

    # NOTE(review): the file names are interpolated into a shell command
    # line; a name containing '"', '$' or a backquote would be interpreted
    # by the shell. The shell is needed here for ulimit and the redirections.
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout; "; }
    $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
    $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
    # quoted like the other paths so that stems containing spaces work
    $cmd .= " 2> \"$err_file\"";

    $! = 0;
    system($cmd);
    # $? is -1 if the program could not be started; otherwise the exit code
    # is in the high byte and a terminating signal in the low bits (see
    # perlfunc - system). Any non-zero value is a failure, including being
    # killed by a signal such as the one raised when the ulimit is exceeded.
    my $status = $?;
    if ($status != 0) {
        return $! ? "$!" : "couldn't run.\n";
    }

    if (! -e $text_file) {
        return "did not create output file.\n";
    }

    # make sure the interpreter didn't get an error. It is technically
    # possible for the actual text to start with this, but....
    if (open(my $psout, '<', $text_file)) {
        my $first_line = <$psout>;
        close $psout;
        if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
            return "interpreter error - \"$1\"";
        }
    }
    return "";
}

# Extract text from the PostScript file $input_filename using regular
# expressions only, writing the result to $text_file. Returns 1 on success and
# 0 on failure. The output file is only created once the input has been read
# and recognised as PostScript, so a failure leaves no empty file behind.
#
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_to_text_regex_fallback {
    my ($input_filename, $text_file) = @_;

    print STDERR "Stripping text from postscript\n";

    my $in;
    unless (open($in, '<', $input_filename)) {
        warn "Couldn't read file: $!";
        print STDERR "errors\n";
        return 0;
    }
    # read the whole .ps file in one go
    my $text = do { local $/; <$in> };
    close $in;
    $text = "" unless defined $text;

    # Make sure this is a ps file...
    if ($text !~ m/^%!/) {
        print STDERR "Bad postscript header: not '%!'\n";
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Bad postscript header: not '%!'\n";
            close $faillog;
        }
        return 0;
    }

    # if ps has Page data, then use it to delete all stuff before it.
    # (/s: treat string as single line)
    $text =~ s/^.*?%%Page:.*?\n//s;

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g;    # "(|)" -> " - " (em-dash?)

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g;     # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g;     # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g;     # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g;        # asterisk?
    $text =~ s/\\023/\023/g;         # e acute ('e)
    $text =~ s/\\177/\252/g;         # u"
    # $text =~ s/ ?? /\344/g;        # a"

    my $out;
    unless (open($out, '>', $text_file)) {
        warn "Couldn't write file: $!";
        print STDERR "errors\n";
        return 0;
    }
    print {$out} "$text";
    close $out;
    return 1;
}

# Re-wrap $text_file in place: any line longer than $wrap_length characters is
# broken at the first whitespace found at or after that length. The file is
# first moved to "$text_file.tmp", which is removed again afterwards.
sub _ps_to_text_wrap_file {
    my ($text_file, $wrap_length) = @_;

    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);

    # "or" rather than "||": with "||" the die is attached to the file name
    # string (always true) and a failed open goes unnoticed.
    open(my $in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $out, '>', $text_file)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$in>) {
        while (length($line) > 0) {
            # Take at least $wrap_length characters plus the rest of the
            # current word, and drop the whitespace that follows. If the
            # pattern cannot match, the remainder is written unchanged so
            # that the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}\S*)\s*//) {
                print {$out} "$1\n";
            } else {
                print {$out} "$line";
                $line = "";
            }
        }
    }
    close $in;
    close $out;
    &FileUtils::removeFiles($tmp_file);
    return;
}
1142
1143
1144	# Convert any file to HTML with a crude perl implementation of the
1145	# UNIX strings command.
1146
sub any_to_html {
    # Convert any file to HTML: any_to_text() first produces
    # "$output_filestem.text", which is then wrapped in a minimal HTML page
    # written to "$output_filestem.html". Blank lines become "<p>", every
    # other line is prefixed with "<br> ". The intermediate .text file is
    # removed afterwards.
    #
    # Parameters:
    #   $input_filename  - path of the file to convert
    #   $output_filestem - output path without extension
    #
    # Returns 1 on success, 0 if any_to_text() fails or the text/HTML files
    # cannot be opened or the HTML file cannot be completely written.
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    unless (open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    unless (open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_fh} "<html><head>\n";
    print {$html_fh} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_fh} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_fh} "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # Escape markup characters so the extracted text cannot be read as
        # HTML. '&' must be handled first, otherwise the entities produced
        # for '<' and '>' would be escaped a second time.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_fh} "<p>";
        } else {
            print {$html_fh} "<br> ", $line;
        }
    }
    print {$html_fh} "\n</body></html>\n";

    # buffered write errors only show up when the handle is closed
    my $html_written = close($html_fh);
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);

    unless ($html_written) {
        print STDERR "any_to_html: error writing $html_file: $!\n";
        return 0;
    }
    return 1;
}
1180
1181	# Convert any file to TEXT with a crude perl implementation of the
1182	# UNIX strings command.
1183	# Note - this assumes ascii charsets :( (jrm21)
1184
sub any_to_text {
    # Convert any file to text in "$output_filestem.text" with a crude perl
    # implementation of the UNIX strings command: runs of characters outside
    # the printable ASCII range are turned into line breaks and strings
    # shorter than 10 characters are dropped.
    # Note - this assumes ascii charsets :( (jrm21)
    #
    # Parameters:
    #   $input_filename  - path of the file to read (read in binary mode)
    #   $output_filestem - output path without extension
    #
    # Does nothing and returns 0 unless the file-level $use_strings is true.
    # Returns 1 if at least one chunk of text was written. Returns 0 (and
    # removes the output file once it has been created) when no text was
    # found or a file could not be opened or completely written.
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    my $text_file = "$output_filestem.text";

    # three-argument open: the file names are never interpreted as modes
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    unless (open($out, '>', $text_file)) {
        close $in;    # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        # NOTE(review): the pass above looks like it already removes every
        # 1-9 character string, so this loop is probably never entered;
        # kept as a safeguard.
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    # buffered write errors only show up when the handle is closed
    my $write_ok = close($out);
    close($in);

    if ($write_ok && $output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: