Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32273

Last change on this file since 32273 was 32273, checked in by ak19, 6 years ago
First of the commits to do with restructuring and refactoring the PDFPlugin. 1. Introducing PDFv1Plugin.pm, which only runs the old pdftohtml. pdfbox_conversion are moved into PDFv2Plugin. 2. In the meantime we still have PDFPlugin, the current state of the plugin, for backward compatibility: it uses both the old pdftohtml tool and still has the pdfbox_conversion option. Yet to introduce the PDFv2Plugin. 3. gsConvert.pl has the new flag pdf_tool, set/passed in by PDFPlugin.pm and all PDFPlugin classes hereafter. The pdf_tool flag can be set to pdftohtml, xpdftools or pdfbox. PDFv1Plugin will always set it to pdftohtml, to denote the old pdftohtml tool is to be used, whereas PDFv2Plugin will set it to xpdftools and PDFBoxConverter sets it for symmetry's sake to pdfbox, even though being an AutoLoadConverter at present, the PDFBoxConverter class bypasses gsConvert.pl. gsConvert.pl uses the pdf_tool flag to determine which tool is to be used to do the conversion to produce the selected output_type. 4. Added some strings. One for migrating users to indicate that PDFPlugin was being deprecated in favour of the PDFv1 and PDFv2 plugins. Another was referenced by CommonUtil, and more recently by PDFPlugin, but was not defined in strings.properties. Once PDFv2Plugin has been added, need to remove references to paged_html from PDFPlugin.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 43.5 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Compile-time setup: abort when the GSDLHOME environment variable is not
# defined, otherwise put "$GSDLHOME/perllib" at the front of @INC so that
# later "use" statements search that directory first.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag falls back to 0.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line option values; main() hands references to most of these
# to parsargv::parse.
my ($use_strings,
    $pdf_tool,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting,
    $enc);
71
# print_usage()
#
# Writes the command-line synopsis for gsConvert.pl to STDERR and then
# terminates the script with exit status 1; it never returns to the caller.
# The option list mirrors the options that main() passes to parsargv::parse,
# including -verbose.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|paged_html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_tool\tpdftohtml|xpdftools|pdfbox (not all output types are supported by every pdf_tool)\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR @usage_lines;
    exit(1);
}
94
# File-level settings shared by the conversion routines: the path of the
# failure log (-errlog), the CPU-seconds limit (-timeout) and the verbosity
# level (-verbose). main() overwrites these defaults from the command line.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
98
# main(@args)
#
# Script driver. Parses the command-line options in @args, works out the
# input type (from -type, or failing that from the filename extension),
# changes into the input file's directory, dispatches to the matching
# convert* routine and prints that routine's result (e.g. "html", "text",
# "item" or "fail") followed by a newline on STDOUT. Finally it changes back
# to the directory it started in.
#
# Exits with status 1 (via print_usage or directly) on bad options, when the
# input file is unreadable, or when the input type is missing/unsupported.
# Dies if either chdir fails.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The flag arrives with its leading dash(es), so the dashes have to be
    # allowed for; anchoring on the bare word alone would never match the
    # option as it is actually passed. The bare word is still accepted.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type, # regex includes html_multi and paged_html besides html
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_tool/(pdftohtml|pdfbox|xpdftools)/', \$pdf_tool, # the old pdftohtml tool, pdfbox extensions or the newer xpdf-tools
                         'pdf_complex', \$pdf_complex, # options for pdf_tool = pdftohtml (the old pdftohtml tool)
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain.
    # NOTE(review): this relies on parsargv::parse removing the options it
    # recognised from @args - confirm against parsargv.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = FileUtils::filenameConcatenate($dirname, "$tailname");

    # No explicit -type: fall back on the (lower-cased) filename extension,
    # without its leading dot. A file with no extension leaves the type empty.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion on STDOUT.
    print "$result\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
205
# Script entry point: run the converter on the command-line arguments.
main(@ARGV);
207
208
209
210	# Document-type conversion functions
211	#
212	# The following functions attempt to convert documents from their
213	# input type to the specified output type. If no output type was
214	# given, then they first attempt HTML, and then TEXT.
215	#
216	# Each returns the output type ("html" or "text") or "fail" if no
217	# conversion is possible.
218
219	# Convert a Microsoft word document
220
# convertDOC($input_filename, $output_filestem, $output_type)
#
# Converts a file that claims to be a Word document. The file's real type is
# sniffed first (many .doc files are not in fact word documents) and the work
# is handed to the matching converter: convertRTF for RTF content,
# convertWord678 for Word 6/7/8 or docx, and convertAnything otherwise.
# Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $realtype = find_docfile_type($input_filename);

    my %is_word_type = map { $_ => 1 } qw(word6 word7 word8 docx);
    if ($is_word_type{$realtype}) {
        return convertWord678($input_filename, $output_filestem, $output_type);
    }

    if ($realtype eq "rtf") {
        return convertRTF($input_filename, $output_filestem, $output_type);
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
236
237	# Convert a Microsoft word 6/7/8 document
238
# convertWord678($input_filename, $output_filestem, $output_type)
#
# Converts a Word 6/7/8 (or docx) document. When no output type is given, or
# the requested type mentions "html", an HTML conversion is attempted with
# native_doc_to_html (if -windows_scripting is on) or doc_to_html. Returns
# "html" when that succeeds; otherwise returns the result of the generic
# convertAnything fallback.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $wants_html = (!$output_type || ($output_type =~ m/html/i));
    if ($wants_html) {
        my $converted = $windows_scripting
            ? native_doc_to_html($input_filename, $output_filestem)
            : doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return convertAnything($input_filename, $output_filestem, $output_type);
}
256
257
258	# Convert a Rich Text Format (RTF) file
259
# convertRTF($input_filename, $output_filestem, $output_type)
#
# Converts a Rich Text Format file to HTML. The conversion is only attempted
# when no output type is given or the requested type mentions "html"; it uses
# native_doc_to_html under -windows_scripting and rtf_to_html otherwise.
# Returns "html" on success and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # rtf is so ugly that it's not worth running strings over, so there is
    # deliberately no convertAnything fallback here: anything other than a
    # successful HTML conversion is a failure.
    return "fail" unless (!$output_type || ($output_type =~ m/html/i));

    # Attempt specialised conversion to HTML
    my $converted = $windows_scripting
        ? native_doc_to_html($input_filename, $output_filestem)
        : rtf_to_html($input_filename, $output_filestem);

    return $converted ? "html" : "fail";
}
284
285
286	# Convert an unidentified file
287
# convertAnything($input_filename, $output_filestem, $output_type)
#
# Last-resort conversion of an unidentified file. Tries any_to_html when no
# output type is given or the type mentions "html", then any_to_text when no
# output type is given or the type mentions "text". Returns "html" or "text"
# for the first attempt that succeeds, or "fail" if none does.
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $any_type_ok = !$output_type;

    # Attempt simple conversion to HTML
    if ($any_type_ok || ($output_type =~ m/html/i)) {
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($any_type_ok || ($output_type =~ m/text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
310
311
312
313	# Convert an Adobe PDF document
314
# convertPDF($dirname, $input_filename, $output_filestem, $output_type)
#
# Converts an Adobe PDF document with the tool selected by -pdf_tool.
#
#  pdftohtml  (old tool): image output via pdfps_to_img (returns "item"),
#             then HTML via pdf_to_html ("html"), then text via pdf_to_text
#             ("text"), each only if the output type asks for it.
#  xpdftools: output type defaults to "html"; (paged) HTML via xpdf_to_html
#             (returns the requested output type), then text via
#             xpdf_to_text ("text").
#
# Any other -pdf_tool value, or no successful attempt, yields "fail".
# (pdfbox does not currently go through gsConvert.pl as PDFBoxConverter
# inherits from AutoLoadConverters.)
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops one character plus the hyphen that follows it from the output
    # type, keeping the character after the hyphen.
    $output_type =~ s/.\-(.)/$1/i;

    if ($pdf_tool eq "pdftohtml") { # old pdftohtml tool
        # Attempt conversion to Image
        if ($output_type =~ m/jp?g|gif|png/i) {
            return "item"
                if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        }

        # Attempt conversion to HTML
        # Uses the old pdftohtml that doesn't work for newer PDF versions
        if ($output_type =~ m/^html/i) {
            return "html"
                if pdf_to_html($dirname, $input_filename, $output_filestem);
        }

        # Attempt conversion to TEXT (not for Windows, but
        # PDFPlugin/PDFv1Plugin takes care of that)
        if (!$output_type || ($output_type =~ m/text/i)) {
            return "text"
                if pdf_to_text($dirname, $input_filename, $output_filestem);
        }

        return "fail";
    }

    if ($pdf_tool eq "xpdftools") {
        # default to html output
        $output_type = "html" unless $output_type;

        # Attempt conversion to (paged) HTML using the newer pdftohtml of
        # Xpdftools.
        if ($output_type =~ m/^(paged_html|html)$/i) {
            return $output_type
                if xpdf_to_html($dirname, $input_filename, $output_filestem);
        }

        # Attempt conversion to TEXT. The output type can no longer be empty
        # at this point, so only an explicit text request gets here.
        if ($output_type =~ m/text/i) {
            return "text"
                if xpdf_to_text($dirname, $input_filename, $output_filestem);
        }
    }

    return "fail";
}
390
391
392	# Convert an Adobe PostScript document
393
# convertPS($dirname, $input_filename, $output_filestem, $output_type)
#
# Converts a PostScript document. An image conversion (pdfps_to_img) is tried
# when the output type names jpg/jpeg-style, gif or png output and returns
# "item" on success; a text conversion (ps_to_text) is tried when no output
# type is given or it mentions "text" and returns "text" on success.
# Returns "fail" otherwise.
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Drops one character plus the hyphen that follows it from the output
    # type, keeping the character after the hyphen.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
416
417
# convertPPT($input_filename, $output_filestem, $output_type)
#
# Converts a PowerPoint presentation.
#
#  * With -windows_scripting and an output type that mentions neither "html"
#    nor "text", the pptextract VB script is run to produce page images
#    (-g gif, -j jpg, -p png). Returns "item" on success, and also when the
#    conversion directory $output_filestem already exists.
#  * Otherwise, when no output type is given or it mentions "html",
#    ppttohtml.pl is run and "html" is returned on success.
#
# If the chosen converter fails (or neither case applies) any_to_text is
# tried as a fallback: "text" on success, "fail" otherwise.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto pptextract's flag.
        my $ppt_convert_type = ($output_type =~ m/gif/i)  ? "-g"
                             : ($output_type =~ m/jp?g/i) ? "-j"
                             : ($output_type =~ m/png/i)  ? "-p"
                             :                              "";

        my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                      $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Fallback: plain text extraction.
    return any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
483
484
# convertXLS($input_filename, $output_filestem, $output_type)
#
# Converts an Excel spreadsheet. When no output type is given or it mentions
# "html", xlstohtml.pl is run and "html" is returned on success. If that
# fails, or HTML was not wanted, any_to_text is tried: "text" on success,
# "fail" otherwise.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fallback: plain text extraction.
    return any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
518
519
520
521	# Find the real type of a .doc file
522	#
523	# We seem to have a lot of files with a .doc extension that are .rtf
524	# files or Word 5 files. This function attempts to tell the difference.
# find_docfile_type($input_filename)
#
# Sniffs the real type of a file that carries a .doc-style extension.
#
# Returns:
#   "docx"     when -windows_scripting is on and the name ends in ".docx"
#              (decided from the name alone; the file is not read)
#   "rtf"      when the first line starts with "{\rtf"
#   "word6", "word7" or "word8"
#              when any line contains "Word.Document.6/7/8"
#   "unknown"  otherwise, including when the file cannot be opened
#
# The file is read in binary mode through a lexical handle that is closed on
# every path, and the global $_ is left untouched.
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open so that odd characters in the filename cannot be
    # misread as an open mode. An unreadable file is simply "unknown".
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    close($chk);
    return $type;
}
561
562
563	# Specific type-to-type conversions
564	#
565	# Each of the following functions attempts to convert a document from
566	# a specific format to another. If they succeed they return 1 and leave
567	# the output document(s) in the appropriate place; if they fail they
568	# return 0 and delete any working files.
569
570
571	# Attempt to convert a word document to html with the wv program
# doc_to_html($input_filename, $output_filestem)
#
# Attempts to convert a Word document to HTML by launching wvware.pl with the
# input file, the output filestem, the failure log path, the verbosity and
# the timeout.
#
# Returns the exit code of the launched command, which callers test for
# truth. Returns 0 when the command could not be started at all or was
# terminated by a signal, so that neither case is mistaken for success.
# NOTE(review): callers treat any non-zero value as success, so wvware.pl
# presumably exits non-zero when the conversion worked - confirm against
# wvware.pl.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"" . util::get_perl_exec() . "\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command could not be launched; dividing
    # that by 256 would give a small non-zero (true) value.
    if ($raw_status == -1) {
        print STDERR "Error: unable to launch wvware.pl: $!\n";
        return 0;
    }

    # The low 7 bits hold the signal number if the child was killed.
    if ($raw_status & 127) {
        print STDERR "Error: wvware.pl was terminated by signal " . ($raw_status & 127) . "\n";
        return 0;
    }

    # Normal termination: the exit code lives in the high byte.
    return $raw_status >> 8;
}
585
586	# Attempt to convert a word document to html with the word2html scripting program
# native_doc_to_html($input_filename, $output_filestem)
#
# Attempts to convert a Word (or RTF) document to "$output_filestem.html"
# using a scripting tool: docx2html.vbs (run through CScript) for docx input
# on Windows, the word2html executable otherwise.
#
# Returns 1 when the HTML file already exists or was produced and its first
# line mentions "html"; returns 0 otherwise. Error output captured in
# "$output_filestem.err" is echoed to STDERR and/or appended to the failure
# log ($faillogfile) when one is configured. All files are opened with
# three-argument open on lexical handles, and every handle that was opened
# is closed again.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i);

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($on_windows) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . FileUtils::filenameConcatenate($vbScript, "word2html") . "\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if (!$on_windows || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";

        if (-s $err_file && open(my $errfile, '<', $err_file)) {
            # Only write to the failure log if one was requested and it
            # could be opened for appending.
            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }

            close($faillog) if ($write_to_fail_log);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file && open(my $tmp, '<', $html_file)) {
        my $line = <$tmp>;
        close($tmp);
        if ($line && $line =~ m/html/i) {
            FileUtils::removeFiles($err_file) if -e $err_file;
            return 1;
        }
    }

    # If here, an error of some sort occurred
    FileUtils::removeFiles($html_file) if -e $html_file;
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
686
687	# Attempt to convert an RTF document to html with rtftohtml
# rtf_to_html($input_filename, $output_filestem)
#
# Attempts to convert an RTF document to "$output_filestem.html" with the
# rtftohtml program.
#
# The conversion counts as successful when the HTML file is non-empty and
# holds some word characters (outside tags) after the line containing
# "<body>". On success, a table of contents file "${output_filestem}_ToC.html"
# (if present) is merged into the HTML after its header, with the links made
# relative, and 1 is returned. On failure the converter's error output is
# appended to the failure log (when configured), the working files are
# removed and 0 is returned.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            FileUtils::moveFiles($html_file, $src_file);

            # Each open is checked on its own. The output file is only
            # opened once both inputs are readable.
            my ($htmlsrc, $toc, $html_out);
            my $src_ok = open($htmlsrc, '<', $src_file);
            my $toc_ok = open($toc, '<', $toc_file);
            my $out_ok = ($src_ok && $toc_ok) && open($html_out, '>', $html_file);

            if (!$out_ok) {
                # Could not merge: put the original HTML back and settle for
                # the conversion without a table of contents.
                close($htmlsrc) if ($src_ok);
                close($toc) if ($toc_ok);
                FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line that holds no word characters; that line is dropped)
            my $line;
            while (defined($line = <$htmlsrc>) && $line =~ m/\w/) {
                print {$html_out} "$line";
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>; # ignore first 2 lines
            $skipped = <$toc>;
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_open if (defined $list_open);
            while ($line = <$toc>) {
                $line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while ($line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            FileUtils::removeFiles($toc_file);
            FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }

    FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
803
804
805	# Convert a pdf file to html with the old pdftohtml command
806	# which only works for older PDF versions
# pdf_to_html($dirname, $input_filename, $output_filestem)
#
# Converts a PDF file to "$output_filestem.html" by running pdftohtml.pl (the
# old pdftohtml tool, which only works for older PDF versions). The -pdf_zoom,
# -pdf_complex, -pdf_ignore_images, -pdf_allow_images_only and -pdf_nohidden
# settings are passed through as command-line flags.
#
# Returns 1 when the command exits with status 0 and the HTML file is
# non-empty. Otherwise the captured error output is echoed to STDERR,
# appended to the failure log (when configured), the working files are
# removed and 0 is returned. Dies if a non-empty error log exists but cannot
# be opened for echoing.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    my $full_perl_path = util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # The converter succeeded and made something: tidy up and report success.
    if ($retval == 0 && -s $html_file) {
        FileUtils::removeFiles($err_file) if (-e $err_file);
        FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on.
    FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
868
869
870	# Convert a pdf file to html with the newer Xpdftools' pdftohtml
871	# This generates "paged HTML" where extracted, selectable text is positioned
872	# over screenshots of each page.
873	# Since xpdf's pdftohtml fails if the output dir already exists and for easier
874	# naming, the output files are created in a "pages" subdirectory of the tmp
875	# location parent of $output_filestem instead
# xpdf_to_html($dirname, $input_filename, $output_filestem)
#
# Converts a PDF file to "paged HTML" with the pdftohtml program found in the
# directory returned by _get_xpdftools_bindir(). The tool writes its products
# into a "pages" subdirectory next to $output_filestem; -pdf_zoom is passed
# on as "-z" when set.
#
# Returns 1 when the command exits with status 0 and "pages/index.html" is
# non-empty. Otherwise the captured error output is echoed to STDERR,
# appended to the failure log (when configured), the working files are
# removed and 0 is returned. Dies if a non-empty error log exists but cannot
# be opened for echoing.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $err_file = "$output_filestem.err";
    my $out_file = "$output_filestem.out";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftohtml");

    # We'll create the file by name $output_filestem during post-conversion
    # processing. Xpdf tools will only create its conversion products in a
    # dir that does not yet exist, so they go into a "pages" subdir of the
    # output_filestem's parent directory (the already generated tmp area).
    # Only the directory part of the filestem is needed here.
    my (undef, $tmp_dirname)
        = File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = FileUtils::filenameConcatenate($tmp_dirname, "pages");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # The converter succeeded and made something: tidy up and report success.
    my $index_file = FileUtils::filenameConcatenate($tmp_dirname, "index.html");
    if ($retval == 0 && -s $index_file) {
        FileUtils::removeFiles($err_file) if (-e $err_file);
        FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Failure from here on.
    FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(my $errlog, '<', $err_file) or die "$!";
        print STDERR "pdftohtml error log:\n";
        while (my $line = <$errlog>) {
            print STDERR "$line";
        }
        close($errlog);
    }

    # NOTE(review): removeFiles is being handed a directory here - confirm
    # that FileUtils::removeFiles can delete directories, or whether a
    # recursive removal is needed.
    FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        FileUtils::removeFiles($err_file);
    }
    return 0;
}
954
# Returns the path of the bin directory that holds the bundled xpdf-tools
# executables: $GSDLHOME/bin/$GSDLOS/xpdf-tools/bin.
# NOTE(review): only the OS name ($GSDLOS) is used to select the directory;
# nothing here distinguishes 32-bit from 64-bit machines.
sub _get_xpdftools_bindir {

    my @path_components = (
        $ENV{'GSDLHOME'},
        "bin",
        $ENV{'GSDLOS'},
        "xpdf-tools",
        "bin",
    );

    # scalar() keeps the single-value return regardless of the caller's context
    return scalar(&FileUtils::filenameConcatenate(@path_components));
}
962
# Convert a PDF or PostScript file to a set of images by running pdfpstoimg.pl
# (which relies on ImageMagick's convert command).
#
# Parameters:
#   $dirname         - not used by this sub; kept for interface compatibility
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - output path without extension; "<stem>.out" and
#                      "<stem>.err" are used as temporary capture files
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed on (e.g. "pagedimg_jpg" -> "jpg")
#
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0. On failure the
# captured stderr is echoed to STDERR and appended to $faillogfile (a file-level
# setting) when that is non-empty. Only the exit status is checked, not whether
# any image files were actually produced.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # backticks are used so that identify's output is swallowed, not printed
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore, e.g. "pagedimg_jpg" -> "jpg"
    $output_type =~ s/.*\_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no separate stderr redirection here: stdout is captured in the .err file
        $cmd .= " > \"$err_file\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);

    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (defined(my $line = <$errlog>)) {
                print STDERR $line;
            }
            close $errlog;
        }

        # append the converter's std err to the fail log, then discard it
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $line = <$errlog>)) {
                        print $faillog $line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1046
# Convert a PDF file to text using the pdftotext executable found in the
# xpdf-tools bin directory (see _get_xpdftools_bindir).
# For pdftotext's options, see https://www.xpdfreader.com/pdftotext-man.html
#
# Parameters:
#   $dirname         - not used by this sub
#   $input_filename  - path of the PDF to convert
#   $output_filestem - output path without extension; text is written to
#                      "<stem>.text"
#
# Returns whatever _run_pdf_to_text_cmd returns for the assembled command.
sub xpdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # full path to the conversion tool we're going to use
    my $pdftotext_exe = &FileUtils::filenameConcatenate(_get_xpdftools_bindir(), "pdftotext");

    # Use the encoding requested through $enc when one is set. Otherwise ask for
    # UTF-8 explicitly: the pdftotext man page says the tool defaults to Latin-1
    # (see also https://www.xpdfreader.com/xpdfrc-man.html).
    my $encoding = $enc ? $enc : "UTF-8";

    my @cmd_parts = (
        "\"$pdftotext_exe\"",
        "-enc $encoding",
        "-nopgbrk",
        # Unix style end-of-line (solitary LF): avoids the stray carriage
        # returns that would otherwise end up appended to the doc title.
        "-eol unix",
        "\"$input_filename\"",
        "\"$output_filestem.text\"",
    );
    my $cmd = join(" ", @cmd_parts);

    print STDERR "@@@@ Running command: $cmd\n";

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1078
# Convert a PDF file to text with whichever "pdftotext" command is found on
# the PATH, writing the result to "<output_filestem>.text".
# $dirname is accepted but not used. Returns the result of
# _run_pdf_to_text_cmd for the assembled command.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $cmd = join(" ", "pdftotext", "\"$input_filename\"", "\"$text_file\"");

    return _run_pdf_to_text_cmd($cmd, $output_filestem);
}
1088
# Run a pdf-to-text command line and verify that it produced usable text.
#
# Parameters:
#   $cmd             - shell command that writes its result to
#                      "<output_filestem>.text"; output redirections are
#                      appended here
#   $output_filestem - output path without extension; "<stem>.out" and
#                      "<stem>.err" are used as temporary capture files
#
# Returns 1 if "<stem>.text" exists, is non-empty and contains at least one
# word character; otherwise 0. On failure the captured stderr is echoed to
# STDERR and appended to $faillogfile (a file-level setting) when that is
# non-empty. The temporary .out/.err files are removed on both paths.
sub _run_pdf_to_text_cmd {
    my ($cmd, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # NOTE(review): unlike the other converters in this file, this test does
    # not consult $is_winnt_2000 -- confirm whether that is intentional.
    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            # defined() so that a final line consisting of "0" is still examined
            while (defined(my $line = <$extracted>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $line = <$errlog>)) {
                print STDERR $line;
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        # append the converter's std err to the fail log, then discard it
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $line = <$errlog>)) {
                        print $faillog $line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    # success: discard both capture files (the .out file only exists when
    # stdout was redirected separately above)
    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1150
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Parameters:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; text is written to
#                      "<stem>.text", with "<stem>.err" and "<stem>.text.tmp"
#                      used as temporary files
#
# Strategy: run gs (never attempted on Windows); if that fails, fall back to
# a crude regular-expression based extraction. Whichever way the text was
# obtained, it is finally re-wrapped at the first whitespace after 72 chars.
#
# Returns 1 on success, 0 if the fallback cannot read the input, cannot write
# the output, or the input does not start with "%!". Dies if the files used
# for re-wrapping cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        # quoted, like every other path on this command line
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        my $retcode = $? >> 8; # see man perlfunc - system for this...
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($retcode != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (defined(my $line = <$errlog>)) {
                    print $faillog $line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);


        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in_fh;
        if (!open($in_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in_fh>); # see man perlport, under "System Resources"
        close $in_fh;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # The output file is only created once the input is known to be
        # PostScript, so a rejected input leaves no empty .text file behind.
        my $out_fh;
        if (!open($out_fh, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print $out_fh "$text";
        close $out_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles($text_file, "$text_file.tmp");
    # "or" (not "||") so that the die really applies to the open call
    open(my $wrap_in, '<', "$text_file.tmp")
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$wrap_in>)) {
        while (length($line) > 0) {
            # Cut off the first $wrap_length chars plus the rest of the word
            # they end in. If the pattern ever fails to match, emit the
            # remainder unchanged rather than looping forever.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                print $wrap_out "$line";
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles("$text_file.tmp");

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1321
1322
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text into "<output_filestem>.text",
# then wrapped in a minimal HTML page written to "<output_filestem>.html":
# blank lines become <p>, every other line is prefixed with <br>, and the
# characters &, < and > are escaped so extracted text cannot be read as markup.
# The intermediate .text file is removed afterwards.
#
# Returns 1 on success; 0 if any_to_text fails or either file cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', "$output_filestem.text")) {
        print STDERR "any_to_html: couldn't read $output_filestem.text: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', "$output_filestem.html")) {
        print STDERR "any_to_html: couldn't write $output_filestem.html: $!\n";
        close $text_fh;
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # '&' must be escaped first, or it would re-escape the entities below
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    return 1;
}
1359
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Reads $input_filename in binary mode and writes to "<output_filestem>.text"
# every run of at least 10 printable ASCII characters (octal 040-176).
# Does nothing and returns 0 unless the file-level $use_strings setting is true.
#
# Returns 1 if at least one chunk of text was written; otherwise 0, in which
# case the (empty) output file is removed again. Also returns 0 if either file
# cannot be opened.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    # Three-argument open: the file name is never interpreted as a mode, and
    # the lexical handles are closed automatically on the early returns.
    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);
    open(my $out_fh, '>', "$output_filestem.text") or return 0;

    my $output_line_count = 0;
    while (defined(my $line = <$in_fh>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    close $out_fh;
    close $in_fh;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: