Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32220

Last change on this file since 32220 was 32220, checked in by ak19, 6 years ago
Minor changes to get xpdf-tools' pdftohtml to work for mac too.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 41.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time, ahead of the "use" lines that follow: refuse to start
# without GSDLHOME, then put $GSDLHOME/perllib at the front of the module
# search path.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?  The eval yields undef when
# the Win32 module cannot be loaded; that case is treated as "no".
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() fills these in through parsargv::parse.
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only, $windows_scripting);
69
# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  This sub never returns to its caller.
sub print_usage
{
    print STDERR
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from the help text
        "\t-verbose\t<number>\t(verbosity level, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the parenthesis opened on the previous line was never closed
        "\t\t-pdf_complex is set)\n";
    exit(1);
}
91
# Settings shared by the conversion routines below.  main() overwrites them
# from the -errlog, -timeout and -verbose command-line options.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Script entry point.
#
# Parses the command line, works out the input document type (from -type or,
# failing that, from the filename extension), changes into the document's
# directory, runs the matching convertXXX routine and prints the value it
# returns followed by a newline on STDOUT.
#
# Arguments: the command-line argument list (normally @ARGV).
# Exits with status 1 (via print_usage or directly) on bad usage, an
# unreadable input file, or an unknown input type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The option is written with leading dash(es) on the command line, so the
    # dashes have to be allowed for here or the enhanced types are never
    # enabled.  A bare "windows_scripting" is still accepted as before.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back on the extension (without its leading dot).
    # Leave the type undefined when there is no extension so that the
    # dedicated error message below is actually reachable.
    if (!defined $input_type || $input_type eq "") {
        $input_type = (length($suffix) > 1) ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        # anchored at both ends, like the doc test above
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome of the conversion to the caller on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
203
204
205
206	# Document-type conversion functions
207	#
208	# The following functions attempt to convert documents from their
209	# input type to the specified output type. If no output type was
210	# given, then they first attempt HTML, and then TEXT.
211	#
# Each returns the output type ("html", "text", "paged_html", or "item"
# for image/slide output) or "fail" if no conversion is possible.
214
# Convert a file that claims to be a Microsoft Word document.
#
# The file is sniffed first, since many .doc files are not in fact Word
# documents, and is then handed to the converter matching its real type.
# Returns whatever that converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    if ($detected =~ m/\A(?:word[678]|docx)\z/) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML is acceptable (no output type given, or one mentioning "html"),
# try the Windows scripting converter if -windows_scripting is set and the
# wvware-based converter otherwise.  If that does not succeed, or HTML was
# not wanted, fall back on convertAnything.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file.
#
# Only a specialised conversion to HTML is attempted (Windows scripting when
# -windows_scripting is set, rtftohtml otherwise).  Returns "html" when that
# works and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
280
281
# Convert an unidentified file.
#
# Tries the generic HTML conversion when HTML is acceptable, then the generic
# text conversion when text is acceptable; with no output type both are
# tried in that order.  Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document.
#
# Depending on the requested output type this tries, in order: page images,
# HTML via the old pdftohtml, paged HTML via xpdf-tools' pdftohtml, and plain
# text.  Returns "item", "html", "paged_html" or "text" for the first
# conversion that succeeds, otherwise "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # NOTE(review): pattern reproduced exactly as found.  As written it only
    # collapses a single "X-Y" character pair; confirm against upstream
    # whether ".*\-(.*)" (keep what follows the hyphen) was intended.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if (!$output_type || ($output_type =~ m/^html/i)) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools.  This will be the new default for PDFs when output_type for
    # PDF docs is not specified (once our use of xpdftools' pdftohtml has been
    # implemented on win and mac).
    if ($output_type =~ m/paged_html/i) {
        return "paged_html"
            if (&xpdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (&pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";
}
356
357
# Convert an Adobe PostScript document.
#
# Tries page images when an image type is requested, then plain text when
# text is acceptable.  Returns "item", "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # NOTE(review): pattern reproduced exactly as found.  As written it only
    # collapses a single "X-Y" character pair; confirm against upstream
    # whether ".*\-(.*)" (keep what follows the hyphen) was intended.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint presentation.
#
# With -windows_scripting and an output type that mentions neither html nor
# text, the pptextract script exports the slides as images into a directory
# named after $output_filestem ("item").  Otherwise, when HTML is acceptable,
# ppttohtml.pl is run ("html").  If the chosen converter fails, or neither
# branch applies, generic text extraction is the last resort ("text"),
# and "fail" is returned when that does not work either.
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Flag telling pptextract which image format to produce
        my $image_flag = "";
        if ($output_type =~ m/gif/i) {
            $image_flag = "-g";
        }
        elsif ($output_type =~ m/jp?g/i) {
            $image_flag = "-j";
        }
        elsif ($output_type =~ m/png/i) {
            $image_flag = "-p";
        }

        my $extractor = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                        $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $extractor = "CScript //Nologo \"".$extractor.".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $command = $timeout ? "ulimit -t $timeout;" : "";
        $command .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        if (system($command) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $command = "\"$perl_exec\" -S ppttohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($command) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
449
450
# Convert a Microsoft Excel spreadsheet.
#
# When HTML is acceptable xlstohtml.pl is run ("html").  If it fails, or HTML
# was not wanted, generic text extraction is tried ("text").  Returns "fail"
# when nothing worked.
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $command = "\"$perl_exec\" -S xlstohtml.pl "
            . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($command) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
484
485
486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Argument: the path of the file to inspect.
# Returns:  "docx"  when -windows_scripting is set and the name ends in .docx,
#           "rtf"   when the first line starts with "{\rtf",
#           "word6", "word7" or "word8" when a "Word.Document.N" marker is
#                   found anywhere in the file,
#           "unknown" otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open so that odd characters in the filename cannot be
    # taken for an open mode; a lexical handle so nothing leaks.
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "Error: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $doctype = "unknown";
    my $first = 1;

    # A lexical loop variable keeps the caller's $_ intact.
    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # Single exit point: the handle is closed on every path, including the
    # "unknown" one.
    close($chk);
    return $doctype;
}
527
528
529	# Specific type-to-type conversions
530	#
531	# Each of the following functions attempts to convert a document from
532	# a specific format to another. If they succeed they return 1 and leave
533	# the output document(s) in the appropriate place; if they fail they
534	# return 0 and delete any working files.
535
536
# Attempt to convert a word document to html with the wv program.
#
# Launches wvware.pl through the perl interpreter reported by
# util::get_perl_exec and returns the raw wait status from system() divided
# by 256; callers treat that number as the success flag.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # the path to perl is quoted in case there are spaces in it
    my $perl_exec = &util::get_perl_exec();
    my @words = (
        "\"$perl_exec\"", "-S", "wvware.pl",
        "\"$input_filename\"", "\"$output_filestem\"", "\"$faillogfile\"",
        $verbosity, $timeout,
    );

    return system(join(" ", @words)) / 256;
}
551
# Attempt to convert a word document to html with the word2html scripting
# program (or, for docx input on Windows, the docx2html.vbs script).
#
# Arguments: the input file and the output file stem; the result is written
#            to "$output_filestem.html".
# Returns:   1 when "$output_filestem.html" already exists or was produced
#            and its first line mentions "html"; 0 otherwise.  Messages from
#            the converter are copied to STDERR and, when -errlog was given,
#            appended to the fail log.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s $err_file) {
            my $errfile;
            if (open($errfile, '<', $err_file)) {
                my $faillog;
                my $write_to_fail_log =
                    ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) ? 1 : 0;

                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }

                close($faillog) if ($write_to_fail_log);
                # the error file used to be left open here
                close($errfile);
            }
            else {
                print STDERR "Unable to read $err_file: $!\n";
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s $html_file) {
        my $first_line;
        if (open(my $html, '<', $html_file)) {
            $first_line = <$html>;
            close($html);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles($err_file) if (-e $err_file);
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles($html_file) if (-e $html_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
652
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments: the input file and the output file stem; the result is written
#            to "$output_filestem.html".
# Returns:   1 when the generated HTML has some word content after its
#            <body> line (a generated "_ToC.html" table of contents, if any,
#            is merged into the page first); 0 otherwise, in which case the
#            working files are removed and the converter's messages are
#            appended to the fail log when -errlog was given.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header; an unreadable
        # file simply counts as "no content"
        if (open(my $html, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html);
        }
        else {
            print STDERR "Unable to read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &FileUtils::moveFiles($html_file, $src_file);

            # The opens are tested with the short-circuit && on the return
            # value of open itself.  Writing
            #     open FH, "file" || ++$failed
            # attaches the || to the (always true) filename string, so a
            # failure would never be noticed.  The output file is opened
            # last so that nothing is created unless both inputs are there.
            my $src_ok  = open(my $htmlsrc, '<', $src_file);
            my $toc_ok  = $src_ok && open(my $toc, '<', $toc_file);
            my $html_ok = $toc_ok && open(my $html, '>', $html_file);

            if (!$html_ok) {
                close($htmlsrc) if ($src_ok);
                close($toc)     if ($toc_ok);
                # put the converter's page back and make do without the ToC
                &FileUtils::moveFiles($src_file, $html_file);
                return 1;
            }

            # print out header info from src html (everything up to the first
            # line without a word character, which is consumed and dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print {$html} $header_line;
            }

            # print out table of contents, making links relative
            my $toc_line = <$toc>;      # ignore first 2 lines
            $toc_line = <$toc>;
            $toc_line = <$toc>;         # line 3 = "<ol>\n"
            print {$html} $toc_line if (defined $toc_line);
            while ($toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $body_line = <$htmlsrc>) {
                print {$html} $body_line;
            }
            close($htmlsrc);
            close($html);

            &FileUtils::removeFiles($toc_file);
            &FileUtils::removeFiles($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    return 0;
}
769
770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions.
#
# Arguments: $dirname (not used here), the input file and the output file
#            stem; the result is expected in "$output_filestem.html".
# Returns:   1 when pdftohtml.pl exits cleanly and leaves a non-empty HTML
#            file; 0 otherwise, after echoing the converter's error output
#            to STDERR, appending it to the fail log (when -errlog was
#            given) and removing the working files.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $out_file  = "$output_filestem.out";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Conversion failed from here on.
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # Read the converter's std err (if any) once.  Being unable to read the
    # log is reported but no longer aborts the whole script, so the caller
    # can still fall back on another conversion.
    my @err_lines;
    if (-e $err_file) {
        if (open(my $errlog, '<', $err_file)) {
            @err_lines = <$errlog>;
            close($errlog);
        }
        else {
            print STDERR "Unable to read $err_file: $!\n";
        }
    }
    if (@err_lines) {
        print STDERR "pdftohtml error log:\n";
        print STDERR $_ foreach (@err_lines);
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} $_ foreach (@err_lines);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
834
835
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead.
#
# Arguments: $dirname (not used here), the input file and the output file
#            stem.
# Returns:   1 when the tool exits cleanly and leaves a non-empty
#            "pages/index.html"; 0 otherwise, after echoing the tool's error
#            output to STDERR, appending it to the fail log (when -errlog
#            was given) and removing the working files.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $err_file = "$output_filestem.err";
    my $out_file = "$output_filestem.out";

    # build up the path to the doc-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same
    # as the 64 bit.  On unix (linux|darwin) use the bin32/bin64 folder
    # depending on the BITNESS env var, falling back on bin32 when it is not
    # set.
    #
    # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
    # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or
    # 64-bit specific subdirectories exist in a greenstone installation.
    # None of those locations need exist when xpdf-tools is installed with GS.
    # So don't depend on GSDLARCH as forcing that to be exported has
    # side-effects.
    my $bin_subdir = "bin32";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i && $ENV{'BITNESS'}) {
        $bin_subdir = "bin".$ENV{'BITNESS'};
    }
    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, $bin_subdir);

    # We'll create the file by name $output_filestem during post-conversion
    # processing.  Note that Xpdf tools will only create its conversion
    # products in a dir that does not yet exist. So we'll create this location
    # as a subdir of the output_filestem's parent directory. The parent dir is
    # the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during
    #   post-processing
    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its
    #   products in there.
    my ($tailname, $tmp_dirname, $suffix)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    my $index_file = &FileUtils::filenameConcatenate($tmp_dirname, "index.html");
    if ($retval == 0 && -s $index_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # Conversion failed from here on.
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # Read the converter's std err (if any) once.  Being unable to read the
    # log is reported but no longer aborts the whole script, so the caller
    # can still fall back on another conversion.
    my @err_lines;
    if (-e $err_file) {
        if (open(my $errlog, '<', $err_file)) {
            @err_lines = <$errlog>;
            close($errlog);
        }
        else {
            print STDERR "Unable to read $err_file: $!\n";
        }
    }
    if (@err_lines) {
        print STDERR "pdftohtml error log:\n";
        print STDERR $_ foreach (@err_lines);
    }

    # NOTE(review): this hands a directory to FileUtils::removeFiles, as the
    # original did -- confirm that helper really deletes directories.
    &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} $_ foreach (@err_lines);
            close($faillog);
        }
        &FileUtils::removeFiles($err_file);
    }
    return 0;
}
939
940
941
# Convert a PDF or PostScript file to page images by running pdfpstoimg.pl.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF/PS file to convert
#   $output_filestem - path stem handed to pdfpstoimg.pl; "<stem>.out" and
#                      "<stem>.err" are used as scratch files for the
#                      child's stdout/stderr and removed before returning
#   $output_type     - requested type, e.g. "pagedimg_gif"; only the part
#                      after the last underscore is passed to -convert_to
#
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0.  On failure
# the child's error output is echoed to STDERR and, when $faillogfile is
# set, appended to that file.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (the probe is skipped on Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;
        # Only the exit status is of interest; the captured output is discarded.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found":
        # -1 on Linux and 256 on Windows.  After shifting $? right by 8 and
        # converting the low byte from unsigned to signed this becomes -1 on
        # Linux and 1 on Windows.  gs-magick.pl already performs the same
        # conversion before it exits, but it has to be redone on our side.
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1)
            || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # keep only what follows the last underscore ("pagedimg_gif" -> "gif")
    $output_type =~ s/.*\_(.*)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    # reset $! so that a stale error is not reported below
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter ran successfully
    if ($retval != 0) {
        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }

        # copy the error output into the fail log (if one is configured),
        # then discard the scratch file
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print $faillog $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1025
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - path stem; the text is written to "<stem>.text", and
#                      "<stem>.out" / "<stem>.err" are scratch files for the
#                      child's stdout/stderr
#
# Returns 1 if a non-empty "<stem>.text" containing at least one word
# character was produced, otherwise 0.  On failure pdftotext's error output
# is echoed to STDERR and, when $faillogfile is set, appended to that file.
# The scratch files are removed on both paths.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no separate stderr redirection here: stdout goes to the .err file
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extr_text;
        } else {
            # an unreadable file is treated the same as one without text
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        # copy the error output into the fail log (if one is configured),
        # then discard the scratch file
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) {print $faillog $line;}
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    # the stdout redirect above creates a .out file on non-Windows systems,
    # so clean it up on success as well
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1091
# Convert a PostScript document to text.
#
# note - just using "ps2ascii" isn't good enough, as it returns 0 for a
# postscript interpreter error. ps2ascii is just a wrapper to "gs" anyway,
# so we run gs with ps2ascii.ps directly.  If gs cannot be used (always the
# case on Windows) or reports an error, the text is extracted with a series
# of regular expressions instead, which gives much cruder results.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - path stem; the text is written to "<stem>.text", and
#                      "<stem>.err" / "<stem>.text.tmp" are scratch files
#
# Returns 1 once "<stem>.text" has been written and re-wrapped.  Returns 0
# if the fallback extraction cannot read the input, cannot write the
# output, or the input does not start with "%!".  Dies if the files used by
# the final wrapping pass cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";

    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # Test the complete wait status rather than only the exit code
        # ($? >> 8): a process killed by a signal has an exit code of 0 and
        # must not be mistaken for a successful run.
        # If system returns -1 (couldn't start program), $! holds the reason.
        my $retcode = system($cmd);

        if ($retcode != 0) {
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $psout;
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) {print $faillog $line;}
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # read the whole .ps file in one go
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file...  (checked before the output file is
        # created so that a rejected input leaves nothing behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" (not "||") so that the die applies to open() and not to the filename
    open(my $infile, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # break after the first run of non-space characters that ends
            # beyond the wrap length; if no break point can be found the
            # remainder is written as is, so this loop always terminates
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1262
1263
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - path stem; the result is written to "<stem>.html",
#                      with "<stem>.text" as an intermediate file that is
#                      removed afterwards
#
# Returns 1 if the HTML file was written.  Returns 0 if any_to_text
# produced nothing or one of the files could not be opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # escape HTML metacharacters; "&" must be handled first so that the
        # entities produced for "<" and ">" are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            # blank line: start a new paragraph
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1300
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to scan
#   $output_filestem - path stem; the text is written to "<stem>.text"
#
# Returns 1 if at least one chunk of text was written.  Returns 0 if
# $use_strings is not set, a file could not be opened, or nothing printable
# was found (in which case "<stem>.text" is removed again).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        # don't leave the input handle open behind us
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: