Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32224

Last change on this file since 32224 was 32224, checked in by ak19, 6 years ago
Adding PDF to text support for Windows using Xpdf's pdftotext tool. Previously PDFPlugin would override the output mode to HTML on Windows if txt output mode was selected.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 43.6 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
46	BEGIN {
47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49	}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?  The eval yields undef when
# the Win32 module cannot be loaded; that case is normalised to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
my $enc;
70
# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  This sub never returns to its caller.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() and was previously undocumented.
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
92
# File-level state shared by main() and the conversion routines.
my $faillogfile = "";   # value of -errlog; "" means no fail log is written
my $timeout     = 0;    # value of -timeout; 0 means no "ulimit -t" prefix
my $verbosity   = 0;    # copied from -verbose in main()
96
# Entry point.
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the file extension), changes into the input file's directory
# and dispatches to the matching convertXXX routine.  The string returned by
# that routine is printed on STDOUT followed by a newline.
#
# Arguments: the command-line argument list.
# Exits with status 1 (directly or via print_usage) on bad arguments, an
# unreadable input file, or an unsupported input type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The option arrives with its leading dash(es), so the pattern has to
    # allow for them; the bare word is still accepted as before.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with '/', unlike every other spec
    # here; it is kept as found - confirm parsargv treats it as intended.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back to the extension without its leading dot.
    if ($input_type eq "")
    {
        $input_type = lc(substr($suffix, 1, length($suffix) - 1));
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) or die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to the caller on STDOUT.
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) or die "Unable to return to directory $stored_dir";
}
202
# Run the converter on the script's command-line arguments.
main(@ARGV);
204
205
206
207	# Document-type conversion functions
208	#
209	# The following functions attempt to convert documents from their
210	# input type to the specified output type. If no output type was
211	# given, then they first attempt HTML, and then TEXT.
212	#
# Each returns the output type ("html", "paged_html", "text", or "item" for
# image/paged-image output) or "fail" if no conversion is possible.
215
# Convert a file that claims to be a Microsoft Word document.
#
# The real format is sniffed first with find_docfile_type(), because many
# .doc files are not in fact Word documents.  Word 6/7/8 and docx go to
# convertWord678(), RTF goes to convertRTF(), and everything else goes to
# convertAnything().  Returns whatever the chosen routine returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word_format = map { $_ => 1 } qw(word6 word7 word8 docx);
    if ($is_word_format{$detected}) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
233
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one containing "html"),
# try the Windows scripting converter if -windows_scripting was given, and
# the wv-based converter otherwise.  If that is not wanted or does not
# succeed, fall back to convertAnything().
# Returns "html" or the result of convertAnything().

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = !$output_type || ($output_type =~ m/html/i);
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
253
254
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (via Windows scripting
# when -windows_scripting was given, via rtftohtml otherwise).
# Returns "html" on success and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = !$output_type || ($output_type =~ m/html/i);
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # RTF is so ugly that it's not worth running strings over it, so there is
    # deliberately no convertAnything() fallback here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
281
282
# Convert an unidentified file.
#
# Tries any_to_html() when HTML is acceptable, then any_to_text() when text
# is acceptable; "acceptable" means no output type was given or the given
# type contains the corresponding word.
# Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $any_type = !$output_type;

    # Attempt simple conversion to HTML
    if ($any_type || ($output_type =~ m/html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($any_type || ($output_type =~ m/text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
307
308
309
# Convert an Adobe PDF document.
#
# Conversions are attempted in this order, each only if the output type
# allows it: paged images, HTML via the old pdftohtml, paged HTML via
# Xpdf's pdftohtml, and finally plain text.
# Returns "item", "html", "paged_html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Replace the first "<char>-<char>" sequence with just its last character.
    # NOTE(review): the intent of this normalisation is not evident from this
    # file - confirm against the callers that build -output values.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML.
    # Uses the old pdftohtml that doesn't work for newer PDF versions, so it
    # is only tried when html is explicitly requested.
    if ($output_type =~ m/^html/i) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools. This is also what is tried first when no output type is given.
    if (!$output_type || ($output_type =~ m/paged_html/i)) {
        return "paged_html"
            if xpdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT; on Windows the xpdf tools are used.
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $converted = ($ENV{'GSDLOS'} =~ m/^windows$/i)
            ? xpdf_to_text($dirname, $input_filename, $output_filestem)
            : pdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if $converted;
    }

    return "fail";
}
361
362
# Convert an Adobe PostScript document.
#
# Tries paged images when the output type names an image format, then plain
# text when no output type was given or it contains "text".
# Returns "item", "text" or "fail".

sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Replace the first "<char>-<char>" sequence with just its last character
    # (same normalisation as in convertPDF).
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
387
388
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run to produce one image per slide ("item").
# Otherwise, when HTML is acceptable, ppttohtml.pl is run ("html").
# If the chosen converter fails, or neither applies, any_to_text() is tried.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html|text/i)) {
        # Map the requested image format onto pptextract's option flag.
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        }
        elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        }
        elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                      $ENV{'GSDLOS'}, "pptextract");
        # On Windows the .vbs VBScript is launched through CScript.
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # If the conversion directory already exists, report it as done.
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction.
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
454
455
# Convert a Microsoft Excel document.
#
# When HTML is acceptable, xlstohtml.pl is run; if that is not wanted or
# fails, any_to_text() is tried.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back to plain text extraction.
    return "text" if any_to_text($input_filename, $output_filestem);

    return "fail";
}
489
490
491
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns "docx" (only with -windows_scripting, decided from the file name
# alone), "rtf" (first line starts with "{\rtf"), "word6"/"word7"/"word8"
# (a "Word.Document.N" marker was found), or "unknown" - also when the file
# cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    # Case-insensitive, consistent with the docx test in native_doc_to_html.
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "find_docfile_type: unable to open $input_filename: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path.
    close($chk);

    return $type;
}
532
533
534	# Specific type-to-type conversions
535	#
536	# Each of the following functions attempts to convert a document from
537	# a specific format to another. If they succeed they return 1 and leave
538	# the output document(s) in the appropriate place; if they fail they
539	# return 0 and delete any working files.
540
541
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl and returns its exit code, which callers use as a truth
# value (non-zero means the conversion succeeded).  Returns 0 if wvware.pl
# could not be launched or was terminated by a signal; the former
# "system(...)/256" gave a true value in both of those cases.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    if ($raw_status == -1) {
        print STDERR "Unable to launch wvware.pl: $!\n";
        return 0;
    }
    if ($raw_status & 0xFF) {
        # low byte set: the child died on a signal (possibly dumping core)
        print STDERR "wvware.pl terminated abnormally (wait status $raw_status)\n";
        return 0;
    }

    # High byte of the wait status is wvware.pl's own exit code.
    return $raw_status >> 8;
}
556
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows, docx input is handled by docx2html.vbs (run through CScript),
# everything else by the word2html executable.  Converter error output is
# echoed to STDERR and appended to the fail log when one is configured.
#
# Returns 1 if "$output_filestem.html" already exists or was produced and
# its first line mentions "html"; returns 0 otherwise.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file) {
            my $write_to_fail_log = 0;
            my $faillog;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, '<', $err_file)) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                # this handle used to be left open
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
657
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml, then checks that the generated HTML has some word
# characters after its <body> line.  If rtftohtml also wrote a table of
# contents ("<stem>_ToC.html") it is spliced into the HTML after the header.
#
# Returns 1 on success.  On failure the converter's error output is appended
# to the fail log (when configured), the working files are removed and 0 is
# returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            FileUtils::moveFiles($html_file, $src_file);

            # Each open is checked on its own.  The former
            # 'open FH, "name" || ++$open_failed' attached the || to the
            # file name string, so a failed open was never noticed.
            my $src_ok  = open(my $htmlsrc, '<', $src_file);
            my $toc_ok  = open(my $toc, '<', $toc_file);
            my $html_ok = open(my $html_out, '>', $html_file);

            if (!($src_ok && $toc_ok && $html_ok)) {
                close($htmlsrc)  if $src_ok;
                close($toc)      if $toc_ok;
                close($html_out) if $html_ok;
                # put the unmodified HTML back and settle for that
                FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html (everything up to the first
            # line without a word character; that line itself is dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            my $ignored = <$toc>;   # ignore first 2 lines
            $ignored = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            FileUtils::removeFiles($toc_file);
            FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print FAILLOG "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }

    FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
774
775
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# Returns 1 if the command exited with 0 and left a non-empty
# "<stem>.html"; otherwise echoes the converter's error output to STDERR,
# appends it to the fail log (when configured), removes the working files
# and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # STDERR is only redirected separately off Windows or on NT-class
    # Windows; elsewhere STDOUT is sent to the .err file instead.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if $!;
        print STDERR "\n";
    }

    # The converter worked and made something: tidy up and report success.
    if ($retval == 0 && -s $html_file) {
        FileUtils::removeFiles($err_file) if (-e $err_file);
        FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure path.
    FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, "$output_filestem.err");
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
839
840
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# Returns 1 if the tool exited with 0 and left a non-empty
# "pages/index.html"; otherwise reports the tool's error output, cleans up
# and returns 0.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # build up the path to the pdf-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # Xpdf tools will only create its conversion products in a dir that does
    # not yet exist, so the products go into a "pages" subdir of
    # $output_filestem's parent directory (the tmp area for this conversion).
    my (undef, $tmp_dirname)
        = File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = FileUtils::filenameConcatenate($tmp_dirname, "pages");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    if ($pdf_zoom) { $cmd .= " -z $pdf_zoom"; }
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    # STDERR is only redirected separately off Windows or on NT-class
    # Windows; elsewhere STDOUT is sent to the .err file instead.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    }
    else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        print STDERR ": $!" if $!;
        print STDERR "\n";
    }

    # The converter worked and made something: tidy up and report success.
    if ($retval == 0 && -s FileUtils::filenameConcatenate($tmp_dirname, "index.html")) {
        FileUtils::removeFiles($err_file) if (-e $err_file);
        FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure path.
    FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    # NOTE(review): $tmp_dirname is a directory; confirm that
    # FileUtils::removeFiles handles directories, or whether a recursive
    # removal was intended here.
    FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(my $faillog, ">>$faillogfile"))) {
            open(my $errlog, "$output_filestem.err");
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
925
# Returns the path to xpdf-tools's containing bin dir appropriate for this machine's OS and bitness
#
# The result is $GSDLHOME/bin/$GSDLOS/xpdf-tools/binNN, where NN is:
#  - 32 on Windows (the 32 bit pdftohtml works the same as the 64 bit one),
#  - $ENV{'BITNESS'} on unix (linux|darwin) when that variable is set,
#  - 32 otherwise.
sub _get_xpdftools_bindir {

    my $xpdf_tools_bin = FileUtils::filenameConcatenate(
        $ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    # Don't use $ENV{'GSDLARCH'}, use $ENV{'BITNESS'}: GSDLARCH isn't always
    # set, and is only meant to be set when many other 32-bit or 64-bit
    # specific subdirectories exist in a greenstone installation. None of
    # those need exist when xpdf-tools is installed with GS.
    my $bin_subdir = "bin32";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i && $ENV{'BITNESS'}) {
        $bin_subdir = "bin".$ENV{'BITNESS'};
    }

    return FileUtils::filenameConcatenate($xpdf_tools_bin, $bin_subdir);
}
951
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - output path without extension; "$output_filestem.out"
#                      and "$output_filestem.err" capture the converter's
#                      stdout/stderr and are removed again before returning
#   $output_type     - requested output type; only the part after the last
#                      underscore is handed to pdfpstoimg.pl as -convert_to
#                      (presumably values look like "pagedimg_jpg" - confirm
#                      against the callers)
#
# Reads the file-level settings $verbosity, $timeout, $is_winnt_2000 and
# $faillogfile.
#
# Returns 1 on success, 0 when ImageMagick is unavailable or the conversion
# command exits with a non-zero status.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore; a value without an
    # underscore is passed through untouched.
    $output_type =~ s/.*_(.*)/$1/;

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);

    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }

        # append the error log to the fail log (when one is configured),
        # then get rid of it
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print {$faillog} $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # success: the scratch logs are no longer needed
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1035
# Convert a PDF file to text with xpdftools' pdftotext command.
# Works for Windows too, whereas the old pdftotxt didn't.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Reads the file-level $enc setting. The assembled command line is handed to
# _run_pdf_to_text_cmd, whose result is returned unchanged.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # path to the pdftotext executable inside the xpdf-tools bin dir
    my $pdftotext_exe = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # As per https://www.xpdfreader.com/pdftotext-man.html xpdf's pdftotext
    # defaults to Latin-1; use the requested encoding when there is one and
    # UTF-8 otherwise (see https://www.xpdfreader.com/xpdfrc-man.html)
    my $encoding = $enc ? $enc : "UTF-8";

    # For the options, see https://www.xpdfreader.com/pdftotext-man.html
    my @cmd_parts = (
        "\"$pdftotext_exe\"",
        "-enc $encoding",
        "-nopgbrk",
        # Unix style solitary newline (LF) as end of line marker: avoids the
        # solitary carriage returns at the end of lines that would otherwise
        # end up as \n appended to the doc title
        "-eol unix",
        "\"$input_filename\"",
        "\"$output_filestem.text\"",
    );
    my $cmd = join(" ", @cmd_parts);

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1067
# Convert a PDF file to text with whichever pdftotext command is on the PATH.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; the text is written to
#                      "$output_filestem.text"
#
# Returns whatever _run_pdf_to_text_cmd returns for the assembled command.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";

    return _run_pdf_to_text_cmd(
        "pdftotext \"$input_filename\" \"$text_file\"",
        $output_filestem);
}
1077
# Run a pdf-to-text command line and check that it produced usable text.
#
# Arguments:
#   $cmd             - the command line to run; redirections for stdout and
#                      stderr are appended here
#   $output_filestem - output path without extension. The command is expected
#                      to write "$output_filestem.text"; "$output_filestem.out"
#                      and "$output_filestem.err" are used as scratch logs.
#
# Reads the file-level $faillogfile setting: on failure the contents of the
# error log are appended to that file when it is non-empty.
#
# Returns 1 when a non-empty text file containing at least one word character
# exists afterwards, 0 otherwise. The scratch .out/.err files are removed in
# both cases; on failure the .text file is removed as well.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            while (my $line = <$extracted>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            # an unreadable file is treated the same as one without text
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$errlog>) {
                print STDERR $err_line;
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print {$faillog} $err_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # Success. On non-Windows systems stdout was redirected to the .out file
    # above, so it has to be cleaned up here as well as the .err file.
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1139
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; the text ends up in
#                      "$output_filestem.text", "$output_filestem.err" is used
#                      as a scratch error log
#
# Reads the file-level $timeout and $faillogfile settings.
#
# On Windows, or whenever gs fails, the text is extracted with the regular
# expressions in _ps_regexp_extract_text instead. Either way the resulting
# text is re-wrapped at the first whitespace after 72 characters.
#
# Returns 1 on success, 0 when the fallback cannot read the input, cannot
# write the output, or the input does not start with "%!". Dies if the
# temporary files used for wrapping cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        # quoted like every other path, so a filestem with spaces works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Test the whole wait status rather than only the exit code ($? >> 8),
        # so that a command killed by a signal is not mistaken for success.
        # If the program couldn't be started, look at $! for the message.
        my $wait_status = $?;

        if ($wait_status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $ps_out, '<', $text_file)) {
                my $first_line = <$ps_out>;
                close $ps_out;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
            } else {
                $error = "couldn't read output file: $!";
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # the whole .ps file; see man perlport, under "System Resources"
        my $text = join('', <$ps_in>);
        close $ps_in;

        # Make sure this is a ps file... (checked before the output file is
        # created, so a rejected input leaves nothing behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $text_out;
        if (!open($text_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$text_out} _ps_regexp_extract_text($text);
        close $text_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $unwrapped_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $unwrapped_file);
    # "or", not "||": with "||" the die would attach to the filename string
    # and could never fire
    open(my $wrap_in, '<', $unwrapped_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            # Break after the first run of non-space characters that crosses
            # the wrap length. If nothing can be split off, the remainder is
            # written as-is so the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrap_out} "$1\n";
            } else {
                print {$wrap_out} $line;
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles($unwrapped_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}

# Crude extraction of the text from raw PostScript source with regular
# expressions. Takes the complete document as one string and returns the
# extracted text.
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_regexp_extract_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
    # $text =~ s/ ?? /\344/g; # a"

    return $text;
}
1310
1311
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the page is written to
#                      "$output_filestem.html"
#
# The text is first extracted with any_to_text into "$output_filestem.text".
# Each line is then HTML-escaped and emitted after a "<br>", with blank lines
# becoming "<p>". The intermediate .text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text fails or either file can't be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_in;
    if (!open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_out;
    if (!open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_in;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_out} "<html><head>\n";
    print {$html_out} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_out} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_out} "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # escape the characters that are special in HTML; "&" has to go
        # first so the entities produced below are not escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_out} "<p>";
        } else {
            print {$html_out} "<br> ", $line;
        }
    }
    print {$html_out} "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1348
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan
#   $output_filestem - output path without extension; the strings found are
#                      written to "$output_filestem.text"
#
# Does nothing and returns 0 unless the file-level $use_strings flag is set.
#
# Returns 1 if at least one chunk of text was written, 0 otherwise (in which
# case the .text file is removed again, provided it could be created).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in; # don't leave the input open behind us
        return 0;
    }

    my $output_line_count = 0;
    # a lexical loop variable, so the caller's $_ is left alone
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: