Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32209

Last change on this file since 32209 was 32209, checked in by ak19, 6 years ago
Commented out the BITNESS env var for Windows introduced in the previous commit and not using it in gsConvert.pl since Win can use XPdf's bin32 version of pdftohtml on 64 bit machines too. The lines determining and setting the BITNESS env var in gs2build\setup.bat are only commented out since setting BITNESS on Win may be useful in the future and we don't want to reinvent the wheel
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 42.0 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time: refuse to start without GSDLHOME, and put
# $GSDLHOME/perllib at the front of @INC so that the 'use' statements that
# follow can locate modules stored there.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    defined $gsdl_home or die "GSDLHOME not set\n";
    unshift(@INC, "$gsdl_home/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded, in which
# case the flag falls back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Option flags; main() hands references to these to the argument parser.
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom,
    $pdf_ignore_images, $pdf_allow_images_only, $windows_scripting);
69
# Print the command-line help text to STDERR and terminate the script.
# Never returns: always exits with status 1.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from the help text
        "\t-verbose\t<number>\t(verbosity level)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this message
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
91
# Script-wide settings with their defaults; main() overrides them from the
# -errlog, -timeout and -verbose options.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Entry point. Parses the command line, determines the input type (from
# -type or, failing that, from the filename extension), runs the matching
# convertXXX() routine from inside the input file's directory and prints the
# value that routine returns ("html", "text", "item", "fail", ...) followed
# by a newline on STDOUT.
#
# Parameters: the command-line argument list.
# Exits with status 1 (directly or through print_usage()) on bad usage, an
# unreadable input file, a missing input type or an unsupported input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@args) {
        # The option is given with its leading dash(es); the previous
        # pattern had no dash and so could never match it. A dash-less
        # spelling is still accepted.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # The converters test $output_type with regular expressions; make an
    # absent value an empty string so those tests do not warn.
    $output_type = "" unless defined $output_type;

    # Exactly one argument (the input file) must be left over.
    # NOTE(review): relies on parsargv::parse removing the options it
    # consumed from the array it is given -- confirm.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back on the filename extension (minus its dot).
    # Leave the type undefined when there is no extension either, so that
    # the "no extension" error below can actually be reported.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    # anchored at both ends, like the doc test above, so that an extension
    # merely ending in "ppt"/"xls" is not mistaken for one of these types
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # report the outcome on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
203
204
205
206	# Document-type conversion functions
207	#
208	# The following functions attempt to convert documents from their
209	# input type to the specified output type. If no output type was
210	# given, then they first attempt HTML, and then TEXT.
211	#
212	# Each returns the output type produced ("html", "text", "paged_html" or
213	# "item") or "fail" if no conversion is possible.
214
# Convert a Microsoft word document
#
# The file is sniffed first, because many files with a .doc extension are
# not Word documents at all, and is then passed to the converter that suits
# its real type.
#
# Parameters:
#   $input_filename  - file to convert
#   $output_filestem - output path without extension
#   $output_type     - requested output type (may be empty)
# Returns whatever the chosen converter returns ("html", "text" or "fail").
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    # Word 6/7/8 files and docx share the same conversion route
    if ($realtype =~ m/^(?:word[678]|docx)$/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 document
#
# Tries an HTML conversion when no output type or an HTML output type was
# requested: through Windows scripting if -windows_scripting was given,
# otherwise through doc_to_html(). Anything else, or a failed HTML attempt,
# is handed to convertAnything().
#
# Parameters: ($input_filename, $output_filestem, $output_type)
# Returns "html", or the result of convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file
#
# Only an HTML conversion is attempted (when no output type or an HTML
# output type was requested): through Windows scripting if
# -windows_scripting was given, otherwise through rtf_to_html().
#
# Parameters: ($input_filename, $output_filestem, $output_type)
# Returns "html" on success, otherwise "fail".
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $success = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($success);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
280
281
# Convert an unidentified file
#
# Tries any_to_html() when no output type or an HTML output type was
# requested, then any_to_text() when no output type or a text output type
# was requested.
#
# Parameters: ($input_filename, $output_filestem, $output_type)
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document
#
# Depending on the requested output type this tries, in order: page images,
# HTML (old pdftohtml), paged HTML (Xpdf tools' pdftohtml) and plain text.
#
# Parameters:
#   $dirname         - directory of the input file
#   $input_filename  - PDF to convert
#   $output_filestem - output path without extension
#   $output_type     - requested output type (may be empty or undefined)
# Returns "item" (images), "html", "paged_html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # treat a missing output type as "not specified" instead of running the
    # substitution and matches below on an undefined value
    $output_type = "" unless defined $output_type;

    # squeeze out a dash together with the character in front of it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if (!$output_type || ($output_type =~ m/^html/i)) {
        if (&pdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "html";
        }
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of Xpdftools. This
    # will be the new default for PDFs when output_type for PDF docs is not specified
    # (once our use of xpdftools' pdftohtml has been implemented on win and mac).
    if ($output_type =~ m/paged_html/i) {
        if (&xpdf_to_html($dirname, $input_filename, $output_filestem)) {
            return "paged_html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&pdf_to_text($dirname, $input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
356
357
# Convert an Adobe PostScript document
#
# Tries page images when an image output type was requested, then plain
# text when no output type or a text output type was requested.
#
# Parameters:
#   $dirname         - directory of the input file
#   $input_filename  - PostScript file to convert
#   $output_filestem - output path without extension
#   $output_type     - requested output type (may be empty or undefined)
# Returns "item" (images), "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # treat a missing output type as "not specified" instead of running the
    # substitution and matches below on an undefined value
    $output_type = "" unless defined $output_type;

    # squeeze out a dash together with the character in front of it
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type)) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        if (&ps_to_text($input_filename, $output_filestem)) {
            return "text";
        }
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint presentation
#
# With -windows_scripting and an output type that is neither HTML nor text,
# the pptextract script is run (with -g/-j/-p for gif/jpg/png output).
# Otherwise, when no output type or an HTML output type was requested,
# ppttohtml.pl is run. If the chosen route fails or none applies, text
# extraction with any_to_text() is the last resort.
#
# Parameters:
#   $input_filename  - presentation to convert
#   $output_filestem - output path without extension; it is also the output
#                      location given to pptextract
#   $output_type     - requested output type (may be empty or undefined)
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # treat a missing output type as "not specified" instead of matching
    # against an undefined value
    $output_type = "" unless defined $output_type;

    # stderr is only redirected on non-Windows systems and on WinNT/2000+
    my $redirect_stderr = ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        my $ppt_convert_type = "";
        if ($output_type =~ m/gif/i) {
            $ppt_convert_type = "-g";
        } elsif ($output_type =~ m/jp?g/i) {
            $ppt_convert_type = "-j";
        } elsif ($output_type =~ m/png/i) {
            $ppt_convert_type = "-p";
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        } else {
            # quote the path in case it contains spaces, as
            # native_doc_to_html does
            $vbScript = "\"$vbScript\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout;";}
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        if (system($cmd) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        $cmd .= " 2>\"$output_filestem.err\"" if ($redirect_stderr);

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
449
450
# Convert a Microsoft Excel spreadsheet
#
# Runs xlstohtml.pl when no output type or an HTML output type was
# requested; if that fails or does not apply, falls back on text extraction
# with any_to_text().
#
# Parameters: ($input_filename, $output_filestem, $output_type)
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected on non-Windows systems and on WinNT/2000+
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # last resort: plain text extraction
    if (&any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
484
485
486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Parameters:
#   $input_filename - path of the file to inspect
# Returns:
#   "docx"    when -windows_scripting is on and the name ends in .docx
#             (any letter case, matching the test in native_doc_to_html)
#   "rtf"     when the first line starts with "{\rtf"
#   "word6", "word7" or "word8" when a "Word.Document.N" marker is found
#   "unknown" otherwise, including when the file cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "Warning: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $realtype;
}
527
528
529	# Specific type-to-type conversions
530	#
531	# Each of the following functions attempts to convert a document from
532	# a specific format to another. If they succeed they return 1 and leave
533	# the output document(s) in the appropriate place; if they fail they
534	# return 0 and delete any working files.
535
536
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl with the input file, the output filestem, the fail log
# name, the verbosity and the timeout.
#
# Parameters: ($input_filename, $output_filestem)
# Returns the exit code of wvware.pl, which callers treat as a success flag
# (non-zero = success), or 0 when the script could not be started or was
# killed by a signal.
# NOTE(review): assumes wvware.pl exits non-zero on success, as the original
# "exit status / 256" logic implied -- confirm against wvware.pl.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    $! = 0;
    my $raw_status = system($launch_cmd);

    # system() yields -1 when the command could not be started; the low
    # 7 bits are set when it died from a signal. Dividing either value by
    # 256 gave a non-zero (true) result, i.e. a bogus "success".
    if ($raw_status == -1 || ($raw_status & 127)) {
        print STDERR "Error executing wvware.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
        return 0;
    }

    return $raw_status >> 8;
}
551
# Attempt to convert a word document to html with the word2html scripting program
#
# On Windows, docx input is converted with docx2html.vbs (run through
# CScript) and other input with the word2html executable; elsewhere the
# word2html program under $GSDLHOME/bin/$GSDLOS is used.
#
# Parameters:
#   $input_filename  - document to convert
#   $output_filestem - output path without extension; "$output_filestem.html"
#                      is the conversion product, "$output_filestem.err"
#                      collects the converter's stderr
# Returns 1 on success (or if the .html file already exists), 0 on failure.
# On failure the converter's error output is appended to the fail log, if
# one was given with -errlog.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);
    #print STDERR "@@@@@@@@@ cmd=$cmd\n";

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # open the fail log (if any); $faillog stays undef when there
            # is none or it cannot be opened
            my $faillog;
            if ($faillogfile eq "" || !open($faillog, ">>", $faillogfile)) {
                $faillog = undef;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($faillog);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n" if ($faillog);
                }
                # this handle used to be left open
                close($errfile);
            }
            close($faillog) if ($faillog);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful? The first line must mention "html".
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html, "<", "$output_filestem.html")) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
652
# Attempt to convert an RTF document to html with rtftohtml
#
# The conversion counts as successful when "$output_filestem.html" has
# textual content after its <body> line. If rtftohtml also produced a
# "${output_filestem}_ToC.html" table of contents, that is merged into the
# html file (with its links made relative) and then deleted.
#
# Parameters: ($input_filename, $output_filestem)
# Returns 1 on success; 0 on failure, after appending the converter's
# error output to the fail log (if any) and removing the working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    # An html file that cannot be opened counts as a failed conversion.
    my $was_successful = 0;
    if ((-s "$output_filestem.html") && open(my $html_in, "<", "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # The old form `open FH, "file" || ++$open_failed` applied ||
            # to the filename string, so a failed open was never noticed.
            # The output file is only created once both inputs are open.
            my ($htmlsrc, $toc, $html_out);
            my $opened_src = open($htmlsrc, "<", "$output_filestem.src");
            my $opened_toc = open($toc, "<", "${output_filestem}_ToC.html");
            my $opened_out = ($opened_src && $opened_toc)
                ? open($html_out, ">", "$output_filestem.html")
                : 0;

            if (!$opened_out) {
                # give up on the table of contents and restore the plain html
                close($htmlsrc) if ($opened_src);
                close($toc) if ($opened_toc);
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped)
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless ($header_line =~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>; # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if (defined $list_start);
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$htmlsrc>) {
                print {$html_out} $line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
769
770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the zoom factor and the -c/-i/-a/-hidden switches
# derived from the -pdf_* command-line options.
#
# Parameters:
#   $dirname         - directory of the input file (not used here)
#   $input_filename  - PDF to convert
#   $output_filestem - output path without extension
# Returns 1 when the command succeeded and "$output_filestem.html" is
# non-empty. Otherwise returns 0, after echoing the converter's error output
# to STDERR, appending it to the fail log (if any) and removing the working
# files. Dies if an existing, non-empty error file cannot be read.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no stderr redirection here: stdout goes to the .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html") {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        #print STDERR "***********output filestem $output_filestem.html\n";
        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            # copy the error output to the fail log, then discard it
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
834
835
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# Parameters:
#   $dirname         - directory of the input file (not used here)
#   $input_filename  - PDF to convert
#   $output_filestem - output path without extension; only its directory
#                      and the .out/.err side files are used here
# Returns 1 when the command succeeded and "pages/index.html" is non-empty.
# Otherwise returns 0, after echoing the converter's error output to STDERR
# and appending it to the fail log (if any). Dies if an existing, non-empty
# error file cannot be read.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # For Windows, just use the 32 bit xpdf's pdftohtml as it works the
        # same as the 64 bit. (When testing 3 different PDF docs, it made no
        # difference on 64 bit Windows whether the bin32 or bin64 binary was
        # used.)
        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
    } elsif ($ENV{'GSDLOS'} =~ m/^darwin$/i) {
        # TODO: no bin32/bin64 subfolder is selected on darwin yet
    } else {
        # other systems: use the appropriate bin folder for the bitness of
        # the system.
        # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
        # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
        # specific subdirectories exist in a greenstone installation.
        # None of those locations need exist when xpdf-tools is installed with GS.
        # So don't depend on GSDLARCH as forcing that to be exported has side-effects
        if ($ENV{'BITNESS'}) {
            $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin".$ENV{'BITNESS'});
        } else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
            $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
        }
    }

    # We'll create the file by name $output_filestem during post-conversion processing.
    # Note that Xpdf tools will only create its conversion products in a dir that does
    # not yet exist. So we'll create this location as a subdir of the output_filestem's
    # parent directory. The parent dir is the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
    # (only the directory part of the fileparse result is needed)
    my $tmp_dirname = (&File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$"))[1];
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    # $cmd .= " -c" if ($pdf_complex);
    # $cmd .= " -i" if ($pdf_ignore_images);
    # $cmd .= " -a" if ($pdf_allow_images_only);
    # $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # no stderr redirection here: stdout goes to the .err file instead
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s &FileUtils::filenameConcatenate($tmp_dirname, "index.html")) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        #print STDERR "***********output filestem $output_filestem.html\n";

        # NOTE(review): $tmp_dirname is a directory; confirm that
        # FileUtils::removeFiles can remove directories, or whether a
        # recursive removal is needed here.
        &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

        if (-e "$output_filestem.err") {
            # copy the error output to the fail log, then discard it
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                if (open(my $errlog, "<", "$output_filestem.err")) {
                    while (my $line = <$errlog>) {
                        print {$faillog} $line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
942
943
944
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - file to convert
#   $output_filestem - output path stem; "<stem>.out" and "<stem>.err" are
#                      used as temporary logs for the converter's output
#   $output_type     - requested type; everything up to and including the
#                      last underscore is dropped before the value is handed
#                      to pdfpstoimg.pl as its -convert_to argument
#
# Also reads the file-level settings $verbosity, $timeout, $is_winnt_2000
# and $faillogfile.
#
# Returns 1 if pdfpstoimg.pl exited with status 0.  Otherwise the converter's
# error log is echoed to STDERR, appended to $faillogfile (when one is
# configured), the temporary logs are removed and 0 is returned.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_log = "$output_filestem.out";
    my $err_log = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        # Only the exit status is of interest; the captured output is ignored.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only the part after the last underscore (the greedy .* consumes
    # everything up to it); a value without an underscore is left unchanged.
    $output_type =~ s/.*_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_log\" 2> \"$err_log\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err log
        $cmd .= " > \"$err_log\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);

    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles($out_log) if (-e $out_log);

        # print out the converter's std err, if any
        if (-s $err_log) {
            open(my $errlog, '<', $err_log) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $log_line = <$errlog>) {
                print STDERR $log_line;
            }
            close $errlog;
        }

        if (-e $err_log) {
            # append the converter's error log to the fail log, if configured
            if (defined $faillogfile && $faillogfile ne ""
                && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_log)) {
                    while (my $log_line = <$errlog>) {
                        print $faillog $log_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_log);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_log) if (-e $err_log);
    &FileUtils::removeFiles($out_log) if (-e $out_log);
    return 1;
}
1028
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - not used by this routine
#   $input_filename  - PDF file to convert
#   $output_filestem - output path stem; the text is written to
#                      "<stem>.text", and "<stem>.out" / "<stem>.err" are
#                      used as temporary logs
#
# Also reads the file-level setting $faillogfile.
#
# Returns 1 if "<stem>.text" exists, is non-empty and contains at least one
# word character.  Otherwise the converter's error log is echoed to STDERR,
# appended to $faillogfile (when one is configured), all temporary files are
# removed and 0 is returned.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_log   = "$output_filestem.out";
    my $err_log   = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_log\" 2> \"$err_log\"";
    } else {
        $cmd .= " > \"$err_log\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extracted, '<', $text_file)) {
            binmode($extracted); # just in case...
            # test definedness, not truth, so a final line of "0" is still read
            while (defined(my $line = <$extracted>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extracted;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_log) {
            open(my $errlog, '<', $err_log) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $log_line = <$errlog>) {
                print STDERR $log_line;
            }
            close $errlog;
        }

        &FileUtils::removeFiles($out_log) if (-e $out_log);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if (-e $err_log) {
            # append the converter's error log to the fail log, if configured
            if (defined $faillogfile && $faillogfile ne ""
                && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_log)) {
                    while (my $log_line = <$errlog>) {
                        print $faillog $log_line;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_log);
        }
        return 0;
    }

    # Success: clear away both temporary logs.  The .out file is created by
    # the shell redirection on non-Windows systems even when it stays empty.
    &FileUtils::removeFiles($err_log) if (-e $err_log);
    &FileUtils::removeFiles($out_log) if (-e $out_log);
    return 1;
}
1094
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - PostScript file to convert
#   $output_filestem - output path stem; the text is written to
#                      "<stem>.text", with "<stem>.err" and
#                      "<stem>.text.tmp" used as temporary files
#
# Also reads the file-level settings $timeout and $faillogfile.
#
# gs is tried first (never on Windows).  If that fails, the text is pulled
# out of the PostScript source with a series of regular expressions.  The
# resulting text is finally re-wrapped at the first whitespace after 72
# characters.
#
# Returns 1 on success; returns 0 if the fallback cannot open its files or
# the input does not start with "%!".  Dies if the wrapping step cannot
# open its temporary or output file.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_log   = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path, so stems containing spaces work
        $cmd .= " 2> \"$err_log\"";
        $! = 0;

        # Test the whole wait status rather than only the exit code ($? >> 8),
        # so that a gs killed by a signal (for example by the ulimit CPU cap)
        # or one that could not be started at all is also treated as a failure.
        my $wait_status = system($cmd);
        # if system returns -1 (couldn't start program), look at $! for message

        if ($wait_status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $ps_out, '<', $text_file)) {
                my $first_line = <$ps_out>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $ps_out;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if (defined $faillogfile && $faillogfile ne ""
            && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_log && open(my $errlog, '<', $err_log)) {
                while (my $log_line = <$errlog>) {
                    print $faillog $log_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_log) if (-e $err_log);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $errorcode = 0;
        my ($ps_in, $text_out);
        unless (open($ps_in, '<', $input_filename)) {
            $errorcode = 1;
            warn "Couldn't read file: $!";
        }
        unless (open($text_out, '>', $text_file)) {
            $errorcode = 1;
            warn "Couldn't write file: $!";
        }
        if ($errorcode) {print STDERR "errors\n"; return 0;}

        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close $ps_in;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if (defined $faillogfile && $faillogfile ne ""
                && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            close $text_out;
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print $text_out "$text";
        close $text_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" (not "||") so the die applies to open() and not to the file name
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            # break after the first run of non-space characters that
            # reaches past $wrap_length, swallowing the following whitespace
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                # short enough (or nothing to break on): emit as is, so the
                # loop is guaranteed to terminate
                print $wrap_out "$line";
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_log) if (-e $err_log);
    return 1;
}
1265
1266
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - file to extract printable strings from
#   $output_filestem - output path stem; the page is written to
#                      "<stem>.html" ("<stem>.text" is used as an
#                      intermediate file and removed afterwards)
#
# The text is produced by any_to_text(); every non-blank line is emitted
# after a "<br> " and every blank line becomes a "<p>".
#
# Returns 1 on success, 0 if any_to_text() fails or a file cannot be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_in, $html_out);
    unless (open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    unless (open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_in;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_out "<html><head>\n";
    print $html_out "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_out "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_out "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # escape HTML metacharacters; "&" must go first so the entities
        # produced for "<" and ">" are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_out "<p>";
        } else {
            print $html_out "<br> ", $line;
        }
    }
    print $html_out "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1303
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - file to extract printable strings from
#   $output_filestem - output path stem; the text is written to
#                      "<stem>.text"
#
# Does nothing and returns 0 unless the file-level setting $use_strings is
# true.  The input is read line by line: every run of bytes outside
# \040-\176 becomes a newline and strings shorter than 10 characters are
# dropped.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 if a file
# could not be opened, or if nothing printable was found (in which case the
# output file is removed again).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    my $text_file = "$output_filestem.text";

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    unless (open($out, '>', $text_file)) {
        close $in;      # don't leave the input handle open on failure
        return 0;
    }

    my $output_line_count = 0;
    # a named lexical is used so the caller's $_ is left alone
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: