Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32226

Last change on this file since 32226 was 32226, checked in by ak19, 6 years ago
Making xpdf_to_text, which uses xpdf-tools' pdftotext, the PDF to text conversion tool for Linux and Mac as well. It was recently introduced for Windows, which had no prior PDF to text conversion tool and used to output HTML instead. Since the introduction of xpdf-tools into GS we can support newer PDF versions, so using its pdftotext as the default tool for PDF to text conversions seems to be the way forward.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 43.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Compile-time setup: refuse to run when GSDLHOME is not set, otherwise put
# Greenstone's perllib directory at the front of the module search path so
# the project modules loaded below can be found.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag falls back to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to the argument
# parser ($enc is declared here but not set by main()).
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
my ($windows_scripting, $enc);
70
# Print a summary of the command-line options to STDERR and terminate the
# script with exit status 1. This function never returns.
sub print_usage
{
    print STDERR "\n";
    print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
    print STDERR " or text using third-party programs.\n\n";
    print STDERR " usage: $0 [options] filename\n";
    print STDERR " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n";
    print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
    print STDERR "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n";
    print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
    # -verbose is accepted by main() but was missing from the help text
    print STDERR "\t-verbose\t<number>\t(verbosity level, defaults to 0)\n";
    print STDERR "\t-use_strings\tuse strings to extract text if conversion fails\n";
    print STDERR "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n";
    print STDERR "\t-pdf_complex\tuse complex output when converting PDF to HTML\n";
    print STDERR "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n";
    print STDERR "\t-pdf_ignore_images\tdon't attempt to extract images when\n";
    print STDERR "\t\tconverting PDF to HTML\n";
    print STDERR "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n";
    print STDERR "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n";
    # closing parenthesis added: the description opened one on the line above
    print STDERR "\t\t-pdf_complex is set)\n";
    exit(1);
}
92
# Script-wide settings shared by the conversion routines: the file that error
# messages are appended to (-errlog), the CPU-seconds limit (-timeout) and
# the verbosity level (-verbose). main() overwrites these defaults.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
96
# Entry point: parse the command line, work out the input type and output
# file stem, change into the input file's directory and dispatch to the
# matching convert* routine.
#
# Parameters: the script's command-line arguments.
# Output:     prints whatever the convert* routine returned (e.g. "html",
#             "text", "fail") followed by a newline on STDOUT.
# Exits with status 1 (via print_usage or directly) on bad usage, an
# unreadable input file, or an input type that cannot be converted.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The raw arguments still carry their leading dash(es) at this point, so
    # the dash has to be allowed for when looking for the switch.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one argument (the input file) must be left after the options
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back on the filename extension (without its dot).
    # A file with no extension leaves the type undefined so that the
    # dedicated error message below is actually reached.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        # anchored at both ends, like the doc pattern above
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
202
# Run the conversion with the arguments given on the command line.
main(@ARGV);
204
205
206
207	# Document-type conversion functions
208	#
209	# The following functions attempt to convert documents from their
210	# input type to the specified output type. If no output type was
211	# given, then they first attempt HTML, and then TEXT.
212	#
213	# Each returns the output type ("html" or "text") or "fail" if no
214	# conversion is possible.
215
# Convert a Microsoft Word document.
#
# Many .doc files are not in fact Word documents, so the real content type
# is sniffed first and the file is handed to the matching converter.
# Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = &find_docfile_type($input_filename);

    # RTF content hiding behind a .doc extension
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Genuine Word 6/7/8 documents, and docx
    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word_type{$realtype}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic treatment
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
233
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML output is requested (or no output type is given), tries the
# Word-specific HTML converter: native_doc_to_html when -windows_scripting
# is set, doc_to_html otherwise. Returns "html" if that works; in every
# other case the result of convertAnything() is returned.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));

    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
254
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted: native_doc_to_html
# when -windows_scripting is set, rtf_to_html otherwise.
# Returns "html" on success and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # A request for anything other than HTML (or "no preference") cannot be
    # served here.
    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    return "fail" unless ($html_wanted);

    my $converted = $windows_scripting
        ? &native_doc_to_html($input_filename, $output_filestem)
        : &rtf_to_html($input_filename, $output_filestem);

    # rtf is so ugly that it's not worth running strings over, hence there
    # is no convertAnything() fallback.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return $converted ? "html" : "fail";
}
281
282
# Convert an unidentified file.
#
# Tries a simple conversion to HTML and then to text, each only when the
# requested output type allows it (an empty output type allows both).
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        my $made_html = &any_to_html($input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Conversion to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        my $made_text = &any_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
307
308
309
# Convert an Adobe PDF document.
#
# Depending on the requested output type, tries in this order: page images,
# HTML with the old pdftohtml, paged HTML with xpdf-tools' pdftohtml, and
# plain text with xpdf-tools' pdftotext.
# Returns "item", "html", "paged_html" or "text" for the first conversion
# that succeeds, or "fail" when none does.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops the character before a hyphen together with the hyphen itself
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to HTML, but only when explicitly requested: this
    # uses the old pdftohtml that doesn't work for newer PDF versions.
    if ($output_type =~ m/^html/i) {
        my $made_html = &pdf_to_html($dirname, $input_filename, $output_filestem);
        return "html" if ($made_html);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # xpdf-tools. This is also what runs when no output type is specified.
    if (!$output_type || ($output_type =~ m/paged_html/i)) {
        my $made_paged_html = &xpdf_to_html($dirname, $input_filename, $output_filestem);
        return "paged_html" if ($made_paged_html);
    }

    # Attempt conversion to TEXT. xpdf_to_text is now used on every
    # platform; previously it was used on windows only, with pdf_to_text
    # handling the other platforms.
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &xpdf_to_text($dirname, $input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
362
363
# Convert an Adobe PostScript document.
#
# Tries page images when an image output type is requested, then plain text
# when text (or no particular type) is requested.
# Returns "item", "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops the character before a hyphen together with the hyphen itself
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to image
    if ($output_type =~ m/jp?g|gif|png/i) {
        my $made_images = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        return "item" if ($made_images);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        my $made_text = &ps_to_text($input_filename, $output_filestem);
        return "text" if ($made_text);
    }

    return "fail";
}
388
389
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run to produce slide images ("item"). Otherwise,
# when HTML (or no particular type) is requested, ppttohtml.pl is run
# ("html"). If the chosen converter fails, or neither applies, plain text
# extraction is tried ("text"). Returns "fail" when nothing worked.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # stderr can only be redirected off windows, or on NT-based windows
    my $can_redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    my $use_vb_script = ($windows_scripting
                         && ($output_type !~ m/html/i)
                         && ($output_type !~ m/text/i));

    if ($use_vb_script) {
        # Image format switch handed to pptextract
        my $ppt_convert_type = "";
        if    ($output_type =~ m/gif/i)  { $ppt_convert_type = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $ppt_convert_type = "-j"; }
        elsif ($output_type =~ m/png/i)  { $ppt_convert_type = "-p"; }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript (the pptextract.exe VB executable
            # was used in the past)
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($can_redirect_stderr);

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($can_redirect_stderr);

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    my $made_text = &any_to_text($input_filename, $output_filestem);
    return $made_text ? "text" : "fail";
}
455
456
# Convert a Microsoft Excel document.
#
# When HTML (or no particular type) is requested, xlstohtml.pl is run and
# "html" is returned if it exits cleanly. Otherwise plain text extraction is
# tried. Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back on plain text extraction
    my $made_text = &any_to_text($input_filename, $output_filestem);
    return $made_text ? "text" : "fail";
}
490
491
492
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Parameter: the name of the file to inspect.
# Returns:   "docx"    when -windows_scripting is set and the name ends in .docx,
#            "rtf"     when the first line starts with "{\rtf",
#            "word6", "word7" or "word8" when a "Word.Document.N" marker is found,
#            "unknown" otherwise, including when the file cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # A file that cannot be read cannot be identified
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file (first line only)
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $type;
}
533
534
535	# Specific type-to-type conversions
536	#
537	# Each of the following functions attempts to convert a document from
538	# a specific format to another. If they succeed they return 1 and leave
539	# the output document(s) in the appropriate place; if they fail they
540	# return 0 and delete any working files.
541
542
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl through perl, passing on the fail log file, verbosity and
# timeout settings.
# Returns the exit code of wvware.pl, or 0 when the command could not be
# launched or was terminated by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command cannot be started, and keeps the
    # terminating signal in the low 7 bits of the status. Dividing such a
    # status by 256 gave a non-zero (i.e. true) value, reporting a failed
    # run as a success, so those cases are mapped to 0 explicitly.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    # the exit code proper lives in the high byte
    return $raw_status >> 8;
}
557
# Attempt to convert a word document to html with the word2html scripting program
#
# On windows, docx input is converted with docx2html.vbs (run through
# CScript) and other input with word2html; elsewhere the word2html found
# under $GSDLHOME/bin/$GSDLOS is used.
# Returns 1 when "$output_filestem.html" already exists or was produced and
# its first line mentions html; returns 0 otherwise. Converter error output
# is appended to the fail log when one is configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";

        # Relay the converter's error output; nothing to relay if the error
        # file cannot be opened.
        if (-s $err_file && open(my $errfile, '<', $err_file)) {
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }

            # the error file handle used to be left open here
            close($errfile);
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
658
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml and accepts the result when "$output_filestem.html" has
# some word content after its <body> line. If rtftohtml also produced a
# table of contents file, a modified copy of it (with relative links) is
# merged into the HTML after the header lines.
# Returns 1 on success. On failure returns 0 after appending the converter's
# error output to the fail log (when configured) and removing the .err and
# .html files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $src_file  = "$output_filestem.src";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header; a file that
        # cannot be opened counts as having no content
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &FileUtils::moveFiles($html_file, $src_file);

            # "open ... or", not "||": with "||" the test was applied to the
            # filename string, so a failed open was never noticed.
            my ($htmlsrc, $toc, $html);
            my @opened;
            my $open_failed = 0;
            if (open($htmlsrc, '<', $src_file)) { push(@opened, $htmlsrc); } else { ++$open_failed; }
            if (open($toc, '<', $toc_file))     { push(@opened, $toc); }     else { ++$open_failed; }
            if (open($html, '>', $html_file))   { push(@opened, $html); }    else { ++$open_failed; }

            if ($open_failed) {
                # give up on the merge and put the plain html back
                close($_) foreach (@opened);
                &FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is discarded)
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            scalar(<$toc>); scalar(<$toc>); # ignore first 2 lines
            my $list_start = <$toc>;        # line 3 = "<ol>\n"
            print {$html} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html} $line;
            }
            close($htmlsrc);
            close($html);

            &FileUtils::removeFiles($toc_file);
            &FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
775
776
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# Returns 1 when the command exits cleanly and leaves a non-empty
# "$output_filestem.html"; otherwise relays the converter's error output to
# STDERR and the fail log, removes the working files and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # Put the command line together
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= "\"" . &util::get_perl_exec() . "\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # stdout and stderr go to separate files where that is possible,
    # otherwise only stdout is captured (into the .err file)
    my $split_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $split_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing pdftohtml.pl" . $reason . "\n";
    }

    # The converter made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        # copy the error output into the fail log before discarding it
        if ($faillogfile ne "" && open(my $faillog, ">>$faillogfile")) {
            open(my $errlog, "$output_filestem.err");
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
840
841
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# Returns 1 when the tool exits cleanly and leaves a non-empty index.html in
# that "pages" directory; otherwise relays the tool's error output to STDERR
# and the fail log, cleans up and returns 0.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # path to the xpdf-tools pdftohtml executable we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # We'll create the file by name $output_filestem during post-conversion processing.
    # Note that Xpdf tools will only create its conversion products in a dir that does
    # not yet exist. So we'll create this location as a subdir of the output_filestem's
    # parent directory. The parent dir is the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
    my (undef, $parent_dir) = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    my $tmp_dirname = &FileUtils::filenameConcatenate($parent_dir, "pages");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%.
    # The -pdf_complex, -pdf_ignore_images, -pdf_allow_images_only and
    # -pdf_nohidden switches are not passed on to this tool.
    my $cmd = "\"$xpdf_pdftohtml\"";
    if ($pdf_zoom) {
        $cmd .= " -z $pdf_zoom";
    }
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    # stdout and stderr go to separate files where that is possible,
    # otherwise only stdout is captured (into the .err file)
    my $split_streams = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    $cmd .= $split_streams
        ? " > \"$output_filestem.out\" 2> \"$output_filestem.err\""
        : " > \"$output_filestem.err\"";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing xpdf's pdftohtml tool" . $reason . "\n";
    }

    # The converter made something: tidy up and report success
    if ($retval == 0 && -s &FileUtils::filenameConcatenate($tmp_dirname, "index.html")) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, "$output_filestem.err") or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

    if (-e $err_file) {
        # copy the error output into the fail log before discarding it
        if ($faillogfile ne "" && open(my $faillog, ">>$faillogfile")) {
            open(my $errlog, "$output_filestem.err");
            while (my $line = <$errlog>) {
                print {$faillog} $line;
            }
            close($errlog);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
926
# Returns the path to xpdf-tools's containing bin dir appropriate for this machine's OS and bitness
#
# The result is $GSDLHOME/bin/$GSDLOS/xpdf-tools/<subdir>, where <subdir> is
# "bin32" on windows (the 32 bit pdftohtml works the same as the 64 bit one)
# and "bin" followed by $BITNESS elsewhere, falling back on "bin32" when
# BITNESS is not set.
sub _get_xpdftools_bindir {

    # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since $ENV{'GSDLARCH'}
    # isn't always set and has side-effects when it is set:
    # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
    # specific subdirectories exist in a greenstone installation.
    # None of those locations need exist when xpdf-tools is installed with GS.
    # So don't depend on GSDLARCH as forcing that to be exported has side-effects
    my $bin_subdir = "bin32";
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (!$on_windows && $ENV{'BITNESS'}) {
        # unix (linux|darwin): bin32/bin64 depending on the BITNESS env var
        $bin_subdir = "bin" . $ENV{'BITNESS'};
    }

    # containing directory of the xpdf conversion tools
    my $xpdf_tools_root = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    return &FileUtils::filenameConcatenate($xpdf_tools_root, $bin_subdir);
}
952
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# Parameters:
#   $dirname         - not used by this routine
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - output path without extension; also the stem of the
#                      temporary .out/.err files that capture the command's
#                      stdout/stderr
#   $output_type     - requested output type; only the part after the last
#                      underscore is handed to pdfpstoimg.pl as -convert_to
#                      (a value without an underscore is passed unchanged)
#
# Returns 1 if pdfpstoimg.pl exited with status 0. Returns 0 if ImageMagick
# could not be found or the conversion command failed; in the latter case the
# captured error output is echoed to STDERR and appended to $faillogfile
# (when one is set). The temporary .out/.err files are removed either way.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        # the output itself is not inspected, only the exit status in $?
        my $identify_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found":
        # once $? is shifted by 8 and converted from unsigned to signed it is
        # -1 on Linux and 1 on Windows. gs-magick.pl already does this
        # conversion before exiting, but it has to be redone on this side.
        my $status = $? >> 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # keep only what follows the last underscore of the requested type
    $output_type =~ s/.*\_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);

    if ($retval == 0) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # The converter failed: report it, pass on its error output, clean up.
    print STDERR "Error executing pdfpstoimg.pl";
    if ($!) {print STDERR ": $!";}
    print STDERR "\n";

    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) || die "$!";
        print STDERR "pdfpstoimg error log:\n";
        while (my $err_line = <$errlog>) {
            print STDERR $err_line;
        }
        close $errlog;
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            # copy the error output into the fail log; skip quietly if the
            # error file can't be reopened rather than reading a dead handle
            if (open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
1036
# Convert a PDF file to text with the pdftotext binary shipped in xpdf-tools.
# Works for Windows too, whereas the old pdftotxt didn't.
# For pdftotext's options, see https://www.xpdfreader.com/pdftotext-man.html
#
# $dirname is not used here. The text is written to "$output_filestem.text";
# the return value is whatever _run_pdf_to_text_cmd returns for the command.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # full path to the conversion tool for this OS/bitness
    my $pdftotext_exe = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # Decode with the designated encoding when one was given. Otherwise ask
    # for UTF-8 explicitly, since pdftotext itself defaults to Latin-1
    # (see https://www.xpdfreader.com/xpdfrc-man.html).
    my $encoding = $enc ? $enc : "UTF-8";

    my @cmd_parts = (
        "\"$pdftotext_exe\"",
        "-enc $encoding",
        "-nopgbrk",
        # unix style end of line (solitary LF): avoids the solitary carriage
        # returns at the end of lines that end up appended to the doc title
        "-eol unix",
        "\"$input_filename\"",
        "\"$output_filestem.text\"",
    );
    my $cmd = join(" ", @cmd_parts);

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1068
# Convert a PDF file to text with whichever pdftotext command is on the PATH.
# $dirname is not used. Output goes to "$output_filestem.text" and the result
# is the return value of _run_pdf_to_text_cmd.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = sprintf('pdftotext "%s" "%s.text"', $input_filename, $output_filestem);

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1078
# Run a pdftotext command line and make sure it produced some real text.
#
# Parameters:
#   $cmd             - the command to run, without any output redirection
#   $output_filestem - stem of the files involved: the command is expected to
#                      write "$output_filestem.text"; its output streams are
#                      captured in "$output_filestem.out" / ".err"
#
# Returns 1 if a non-empty .text file containing at least one word character
# exists afterwards. Otherwise returns 0 after echoing the captured error
# output to STDERR, appending it to $faillogfile (when one is set) and
# removing the .out, .text and .err files.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # on windows only stdout is redirected, and it goes to the .err file
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i);
    $cmd .= $on_windows
        ? " > \"$err_file\""
        : " > \"$out_file\" 2> \"$err_file\"";

    my $exit_status = system($cmd);
    if ($exit_status != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open (EXTR_TEXT, $text_file) || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...
        my $has_word_char = 0;
        until ($has_word_char) {
            my $line = <EXTR_TEXT>;
            last unless $line; # stop at end of file (or on a false line)
            $has_word_char = 1 if ($line =~ m/\w/);
        }
        close EXTR_TEXT;
        unless ($has_word_char) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # the converter made something: tidy up and report success
    if (-s $text_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        return 1;
    }

    # Nothing usable was produced.
    # print out the converters std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftotext error log:\n";
        while (defined(my $err_line = <ERRLOG>)) {
            print STDERR $err_line;
        }
        close ERRLOG;
    }

    # does this converter create a .out file?
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    &FileUtils::removeFiles($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            while (defined(my $err_line = <ERRLOG>)) {
                print FAILLOG $err_line;
            }
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
1140
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so gs is run directly here (it is never attempted on Windows).
#
# If gs can't be used or fails, the failure is appended to $faillogfile (when
# one is set) and a crude regular-expression extraction is done in Perl
# instead. Either way the resulting text is then re-wrapped: each output line
# is broken at the first whitespace at or after column 72.
#
# Parameters:
#   $input_filename  - the PostScript file to read
#   $output_filestem - output path without extension; the text ends up in
#                      "$output_filestem.text", and "$output_filestem.err"
#                      temporarily holds gs's stderr
#
# Returns 1 on success. Returns 0 if the Perl fallback can't open its input
# or output file, or if the input doesn't start with '%!'.
# Dies if the files for the wrapping pass can't be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths so a filestem with spaces still works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Any non-zero wait status counts as failure: that covers a non-zero
        # exit code as well as death by signal (see man perlfunc - system).
        # If the program couldn't be started, look at $! for a message.
        my $wait_status = $?;

        if ($wait_status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            close $psout;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
        }
    }

    if ($error ne "")
    {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile))
        {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "Stripping text from postscript\n";

        # both opens are attempted (and both failures reported) before giving up
        my $in_ok = open(my $in, '<', $input_filename);
        warn "Couldn't read file: $!" unless $in_ok;
        my $out_ok = open(my $out, '>', $text_file);
        warn "Couldn't write file: $!" unless $out_ok;
        if (!$in_ok || !$out_ok) {
            close $in if $in_ok;
            close $out if $out_ok;
            print STDERR "errors\n";
            return 0;
        }

        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile))
            {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            close $out;
            return 0;
        }

        print $out _ps_source_to_text($text);
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $unwrapped_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $unwrapped_file);
    # "or", not "||": with "||" the die would attach to the filename string
    # and could never fire
    open(my $infile, '<', $unwrapped_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Peel off $wrap_length characters plus the rest of the word in
            # progress, and drop the whitespace that follows. If that can't be
            # done the remainder is written as it is, so the loop always ends.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile $line;
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles($unwrapped_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}

# Crude text extraction from the complete source of a PostScript file, used
# by ps_to_text when gs is not available: takes the file content as one
# string and returns what remains after the substitutions below.
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_source_to_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
    # $text =~ s/ ?? /\344/g; # a"

    return $text;
}
1311
1312
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text, then wrapped in a minimal
# HTML page: every blank line becomes <p> and every other line is preceded
# by <br>. The characters & < > are escaped so that extracted text which
# happens to look like markup is shown literally.
#
# Parameters:
#   $input_filename  - the file to extract strings from
#   $output_filestem - output path without extension; the page is written to
#                      "$output_filestem.html"
#
# Returns 1 on success. Returns 0 if any_to_text fails or if either the
# intermediate text file or the HTML file can't be opened. The intermediate
# "$output_filestem.text" file is removed before returning.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # '&' has to be escaped first, or it would re-escape the entities
        # produced for '<' and '>'
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1349
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Reads $input_filename in binary mode and writes the printable strings it
# finds to "$output_filestem.text". Does nothing unless $use_strings is set.
#
# Returns 1 if at least one chunk of text was written. Returns 0 if
# $use_strings is off, if either file can't be opened, or if nothing
# printable was found (the output file is removed in that last case).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless $use_strings;

    print STDERR "\n** In any to text**\n\n";
    open(my $in_fh, "<$input_filename") || return 0;
    binmode($in_fh);
    open(my $out_fh, ">$output_filestem.text") || return 0;

    my $chunks_written = 0;
    while (my $chunk = <$in_fh>) {
        # every run of non-printable characters becomes a line break
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # blank out any string less than 10 characters long, repeating
        # until no short string is left
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # output whatever is left, provided it isn't only newlines and spaces
        next unless ($chunk =~ m/[^\n ]/);
        print $out_fh $chunk;
        $chunks_written++;
    }

    close $out_fh;
    close $in_fh;

    # try to protect against binary only formats: nothing written means
    # there is nothing worth keeping
    return 1 if $chunks_written;

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: