Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32207

Last change on this file since 32207 was 32207, checked in by ak19, 6 years ago
Got a basic Windows version of PDFPlugin's new paged_html mode working
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 41.8 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Greenstone's own perl modules live in $GSDLHOME/perllib. Put that directory
# at the front of @INC at compile time, so the 'use' lines that follow can
# find them. Without GSDLHOME there is nothing sensible we can do.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The Win32 module is loaded inside an eval: if the require fails the eval
# yields undef, which is mapped to 0 below.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to parsargv::parse().
my $use_strings;            # -use_strings
my $pdf_complex;            # -pdf_complex
my $pdf_nohidden;           # -pdf_nohidden
my $pdf_zoom;               # -pdf_zoom
my $pdf_ignore_images;      # -pdf_ignore_images
my $pdf_allow_images_only;  # -pdf_allow_images_only
my $windows_scripting;      # -windows_scripting
69
# Print the command-line usage summary on STDERR and terminate the script
# with exit status 1. Takes no arguments and never returns.
sub print_usage
{
    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        # paged_html is handled by convertPDF, so advertise it as well
        "\t-output\tauto|html|text|paged_html|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        # -verbose is accepted by main() but was missing from the summary
        "\t-verbose\t<level>\t(verbosity of messages, default 0)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        # the closing parenthesis was missing from this message
        "\t\t-pdf_complex is set)\n",
    );

    print STDERR $_ foreach @usage_lines;
    exit(1);
}
91
# Script-wide settings, filled in from the command line by main() and read
# by the conversion routines below.
my $faillogfile = '';   # -errlog: file that converter error output is appended to
my $timeout     = 0;    # -timeout: max cpu seconds (0 means no limit is applied)
my $verbosity   = 0;    # -verbose: verbosity level handed on to helper scripts
95
# Entry point: parse the command line, work out the input document type and
# dispatch to the matching convertXXX() routine. The value returned by that
# routine (the output type, or "fail") is printed on STDOUT followed by a
# newline.
#
# Arguments: the command-line argument list (the script passes @ARGV).
# Exits with status 1, after a message on STDERR, when the arguments are
# invalid, the input file is unreadable, or the input type is missing or
# unsupported.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but not XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # Options arrive with a leading dash, so allow for it here (the bare
    # word is still accepted for backward compatibility).
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back on the filename extension (minus its dot).
    # A file without an extension leaves the type undefined, so that the
    # dedicated error message below is actually reached.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {   # anchored like the doc test above
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
201
# Run the conversion with the script's command-line arguments.
main(@ARGV);
203
204
205
206	# Document-type conversion functions
207	#
208	# The following functions attempt to convert documents from their
209	# input type to the specified output type. If no output type was
210	# given, then they first attempt HTML, and then TEXT.
211	#
212	# Each returns the output type ("html", "text", "paged_html" or "item") or
213	# "fail" if no conversion is possible.
214
215	# Convert a Microsoft word document
216
# Convert a file that claims to be a Microsoft Word document.
# The content is sniffed first, because many .doc files are not in fact
# word documents; the file is then handed to the converter for its real type.
# Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    my %is_word_type = map { $_ => 1 } ("word6", "word7", "word8", "docx");
    if ($is_word_type{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
233	# Convert a Microsoft word 6/7/8 document
234
# Convert a Microsoft word 6/7/8 document.
# When HTML is acceptable (no output type given, or one mentioning html) the
# word-specific converter is tried first: the native windows scripting route
# if -windows_scripting was given, the wv-based one otherwise. If that is not
# wanted or does not succeed, the generic conversion takes over.
# Returns "html", or the result of convertAnything().
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
254	# Convert a Rich Text Format (RTF) file
255
# Convert a Rich Text Format (RTF) file.
# Only a specialised conversion to HTML is attempted (native windows
# scripting if enabled, rtftohtml otherwise).
# Returns "html" on success and "fail" in every other case.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));
    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over, so there is
    # deliberately no fall back on convertAnything() here.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    return "fail";
}
280
281
282	# Convert an unidentified file
283
# Convert an unidentified file.
# Tries the generic HTML conversion and then the generic text conversion,
# each only if the requested output type allows it (an empty/undefined
# output type allows both).
# Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
306
307
308
309	# Convert an Adobe PDF document
310
# Convert an Adobe PDF document.
# Depending on the requested output type this tries, in order: page images,
# HTML via the old pdftohtml, paged HTML via xpdf-tools' pdftohtml, and
# plain text. The first conversion that succeeds decides the return value.
# Returns "item" (images), "html", "paged_html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if (!$output_type || ($output_type =~ m/^html/i)) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools. This will be the new default for PDFs when output_type for
    # PDF docs is not specified (once our use of xpdftools' pdftohtml has
    # been implemented on win and mac).
    if ($output_type =~ m/paged_html/i) {
        return "paged_html"
            if &xpdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if &pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
356
357
358	# Convert an Adobe PostScript document
359
# Convert an Adobe PostScript document.
# Tries page images when an image output type was requested, then plain
# text when the output type is empty or mentions text.
# Returns "item" (images), "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint document.
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run to produce slide images in a directory named
# after the output filestem. Otherwise, when HTML is acceptable, ppttohtml.pl
# is tried. If the attempted conversion fails (or none applied), the generic
# text extraction is used as a last resort.
# Returns "item" (slide images), "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && $output_type !~ m/html|text/i) {
        # flag telling pptextract which image format to produce
        my $image_flag = ($output_type =~ m/gif/i)  ? "-g"
                       : ($output_type =~ m/jp?g/i) ? "-j"
                       : ($output_type =~ m/png/i)  ? "-p"
                       :                              "";

        my $extractor = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                        $ENV{'GSDLOS'}, "pptextract");
        # on windows the .vbs VBScript is launched through CScript
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $extractor = "CScript //Nologo \"" . $extractor . ".vbs\"";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$extractor $image_flag \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
449
450
# Convert a Microsoft Excel document.
# When HTML is acceptable, xlstohtml.pl is tried first; if that is not
# wanted or fails, the generic text extraction is used.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $perl_exec = &util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # fall back on generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
484
485
486
487	# Find the real type of a .doc file
488	#
489	# We seem to have a lot of files with a .doc extension that are .rtf
490	# files or Word 5 files. This function attempts to tell the difference.
# Find the real type of a .doc file by looking at its content.
#
# Parameter: $input_filename - path of the file to inspect.
# Returns one of:
#   "docx"    - -windows_scripting is on and the name ends in .docx (any case;
#               the content is not inspected)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6" / "word7" / "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - anything else, including a file that cannot be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    # case-insensitive, to agree with the docx test in native_doc_to_html
    if ($windows_scripting && $input_filename =~ m/\.docx$/i) {
        return "docx";
    }

    # three-argument open on a lexical handle, and check that it worked
    my $chk;
    if (!open($chk, '<', $input_filename)) {
        print STDERR "find_docfile_type: unable to open $input_filename: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $realtype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $realtype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $realtype = "word$1";
            last;
        }
    }

    # single exit point, so the handle is closed on every path
    close($chk);
    return $realtype;
}
527
528
529	# Specific type-to-type conversions
530	#
531	# Each of the following functions attempts to convert a document from
532	# a specific format to another. If they succeed they return 1 and leave
533	# the output document(s) in the appropriate place; if they fail they
534	# return 0 and delete any working files.
535
536
537	# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wv program, by
# launching the wvware.pl wrapper script.
#
# Parameters: $input_filename  - the word document
#             $output_filestem - output path without extension
# Returns the exit code of wvware.pl; callers in this file treat a true
# (non-zero) value as success. Returns 0 when the script could not be
# launched at all or was terminated by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 if the command could not be started, and the low
    # 7 bits hold the signal number if the child was killed. Dividing by
    # 256 turned both cases into a small non-zero (i.e. true) value.
    if ($raw_status == -1 || ($raw_status & 127)) {
        return 0;
    }

    # the exit code proper lives in the high byte
    return $raw_status >> 8;
}
551
552	# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with the word2html scripting
# program (or, for docx input on windows, the docx2html.vbs script).
#
# Parameters: $input_filename  - the document to convert
#             $output_filestem - output path without extension; the result
#                                is written to "$output_filestem.html"
# Returns 1 if "$output_filestem.html" already existed or was produced and
# its first line mentions "html"; 0 otherwise. Converter error output is
# echoed on STDERR and/or appended to the -errlog file when one was given.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the
        # local Word install (if any) to do the conversion, since docX can't
        # be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script, else script launch
            # fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # the //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $errfile;
            if (open($errfile, '<', "$output_filestem.err")) {
                my $faillog;
                my $write_to_fail_log = 0;
                if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                    $write_to_fail_log = 1;
                }

                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }

                close($faillog) if ($write_to_fail_log);
                # this handle used to be left open
                close($errfile);
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful? The first line of the output has to
    # mention html for it to count.
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            $first_line = <$html_fh>;
            close($html_fh);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) {
                    print {$faillog} $line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
652
653	# Attempt to convert an RTF document to html with rtftohtml
# Attempt to convert an RTF document to html with rtftohtml.
#
# Parameters: $input_filename  - the RTF file
#             $output_filestem - output path without extension; the result
#                                is written to "$output_filestem.html"
# Returns 1 if the generated HTML has some word content after its <body>
# line, 0 otherwise. On success a generated "<stem>_ToC.html" table of
# contents, if present, is merged into the HTML file; on failure the working
# files are removed and the converter's error output is appended to the
# -errlog file when one was given.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";
    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an unreadable
        # file simply counts as a failed conversion
        if (open(my $html_fh, '<', "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html_fh>) {
                if (!$past_header) {
                    # the <body> line itself is not examined for content
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_fh);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # The original 'open FH, "file" || ++$open_failed' bound the ||
            # to the filename string, so a failed open was never noticed.
            my ($src_fh, $toc_fh, $out_fh);
            my $src_ok = open($src_fh, '<', "$output_filestem.src");
            my $toc_ok = open($toc_fh, '<', "${output_filestem}_ToC.html");
            # only create the output file once both inputs are readable
            my $out_ok = ($src_ok && $toc_ok
                          && open($out_fh, '>', "$output_filestem.html"));

            if (!$out_ok) {
                close($src_fh) if $src_ok;
                close($toc_fh) if $toc_ok;
                # put the converted file back and settle for no ToC
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped).
            while (defined(my $header_line = <$src_fh>)) {
                last unless ($header_line =~ m/\w/);
                print {$out_fh} $header_line;
            }

            # print out table of contents, making links relative
            <$toc_fh>; <$toc_fh>; # ignore first 2 lines
            my $list_start = <$toc_fh>; # line 3 = "<ol>\n"
            print {$out_fh} $list_start if defined $list_start;
            while (my $toc_line = <$toc_fh>) {
                $toc_line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$out_fh} $toc_line;
            }
            close($toc_fh);

            # rest of html src
            while (my $body_line = <$src_fh>) {
                print {$out_fh} $body_line;
            }
            close($src_fh);
            close($out_fh);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
769
770
771	# Convert a pdf file to html with the old pdftohtml command
772	# which only works for older PDF versions
# Convert a pdf file to html with the old pdftohtml command (run through
# the pdftohtml.pl wrapper), which only works for older PDF versions.
#
# Parameters: $dirname (not used here), $input_filename, and
#             $output_filestem - output path without extension.
# Returns 1 if the command exited with status 0 and left a non-empty
# "$output_filestem.html"; otherwise reports the converter's error output
# (STDERR and, if configured, the -errlog file), removes the working files
# and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # assemble the command line
    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    my $perl_exec = &util::get_perl_exec();
    $cmd .= "\"$perl_exec\" -S pdftohtml.pl -zoom $pdf_zoom";
    if ($pdf_complex)           { $cmd .= " -c"; }
    if ($pdf_ignore_images)     { $cmd .= " -i"; }
    if ($pdf_allow_images_only) { $cmd .= " -a"; }
    if (!$pdf_nohidden)         { $cmd .= " -hidden"; }
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    # capture the converter's output; stdout and stderr only get separate
    # files when the platform test below passes
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing pdftohtml.pl" . $reason . "\n";
    }

    # the converter made something: tidy up and report success
    if ($retval == 0 && -s $html_file) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR $_ while <ERRLOG>;
        close ERRLOG;
    }

    &FileUtils::removeFiles($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            print FAILLOG $_ while <ERRLOG>;
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
834
835
836	# Convert a pdf file to html with the newer Xpdftools' pdftohtml
837	# This generates "paged HTML" where extracted, selectable text is positioned
838	# over screenshots of each page.
839	# Since xpdf's pdftohtml fails if the output dir already exists and for easier
840	# naming, the output files are created in a "pages" subdirectory of the tmp
841	# location parent of $output_filestem instead
# Convert a pdf file to html with the newer Xpdftools' pdftohtml.
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for
# easier naming, the output files are created in a "pages" subdirectory of
# the tmp location parent of $output_filestem instead.
#
# Parameters: $dirname (not used here), $input_filename, and
#             $output_filestem - output path without extension.
# Returns 1 if the tool exited with status 0 and left a non-empty
# "pages/index.html"; otherwise reports the tool's error output (STDERR and,
# if configured, the -errlog file), cleans up and returns 0.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # build up the path to the conversion tool we're going to use
    my $tool_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    if ($ENV{'GSDLOS'} !~ m/^darwin$/i) {
        # unix or windows: use the bin folder matching the bitness of the
        # system. $ENV{'BITNESS'} is used rather than $ENV{'GSDLARCH'},
        # since GSDLARCH is only (meant to be) set when many other 32-bit or
        # 64-bit specific subdirectories exist in a greenstone installation,
        # and forcing it to be exported has side-effects.
        # Falls back on bin32 when BITNESS is not set.
        my $bin_subdir = $ENV{'BITNESS'} ? "bin".$ENV{'BITNESS'} : "bin32";
        $tool_dir = &FileUtils::filenameConcatenate($tool_dir, $bin_subdir);
    }
    # else: darwin is still TODO, no bin subfolder is appended

    # Xpdf tools will only create its conversion products in a dir that does
    # not yet exist, so it is pointed at a "pages" subdir of the (already
    # existing) tmp directory that holds $output_filestem.
    my (undef, $pages_dir) = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $pages_dir = &FileUtils::filenameConcatenate($pages_dir, "pages");

    my $tool = &FileUtils::filenameConcatenate($tool_dir, "pdftohtml");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$tool\"";
    if ($pdf_zoom) { $cmd .= " -z $pdf_zoom"; }
    $cmd .= " \"$input_filename\" \"$pages_dir\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }
    else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        my $reason = $! ? ": $!" : "";
        print STDERR "Error executing xpdf's pdftohtml tool" . $reason . "\n";
    }

    # the converter made something: tidy up and report success
    if ($retval == 0 && -s &FileUtils::filenameConcatenate($pages_dir, "index.html")) {
        &FileUtils::removeFiles($err_file) if (-e $err_file);
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        return 1;
    }

    # failure from here on
    &FileUtils::removeFiles($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open (ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR $_ while <ERRLOG>;
        close ERRLOG;
    }

    &FileUtils::removeFiles("$pages_dir") if (-d "$pages_dir");

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile"))) {
            open (ERRLOG, $err_file);
            print FAILLOG $_ while <ERRLOG>;
            close ERRLOG;
            close FAILLOG;
        }
        &FileUtils::removeFiles($err_file);
    }

    return 0;
}
942
943
944
# Convert a PDF or PostScript file to page images by running pdfpstoimg.pl.
#
# Parameters:
#   $dirname         - not used by this routine.
#   $input_filename  - path of the PDF/PS file to convert.
#   $output_filestem - handed to pdfpstoimg.pl as its output argument; also
#                      the stem of the temporary "<stem>.out"/"<stem>.err"
#                      files that capture the child's stdout/stderr.
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed to pdfpstoimg.pl's -convert_to.
#
# Reads the file-level variables $verbosity, $timeout, $is_winnt_2000 and
# $faillogfile.
#
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0. On failure
# the captured error output is echoed to STDERR and, when $faillogfile is
# set, appended to that file. The .out/.err files are removed either way.

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # The output itself is not inspected; the call is made for its exit status in $?.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore of the requested type.
    $output_type =~ s/.*_(.*)/$1/;

    # pdfpstoimg.pl itself is located through perl's -S (PATH) search, so no
    # explicit script path needs to be quoted here.
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # No separate stderr redirection on this platform: stdout goes to the .err file.
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (defined(my $logline = <$errlog>)) {
                print STDERR $logline;
            }
            close $errlog;
        }

        # copy the error output to the fail log (when configured), then discard it
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $logline = <$errlog>)) {
                        print $faillog $logline;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1028
# Convert a PDF file to text with the pdftotext command.
#
# Parameters:
#   $dirname         - not used by this routine.
#   $input_filename  - path of the PDF file to convert.
#   $output_filestem - the text is written to "<stem>.text"; "<stem>.out" and
#                      "<stem>.err" temporarily capture the command's output.
#
# Reads the file-level variable $faillogfile.
#
# Returns 1 if "<stem>.text" exists, is non-empty and contains at least one
# word character; otherwise 0, after echoing pdftotext's error output to
# STDERR and appending it to $faillogfile when that is set. The .out/.err
# capture files are removed on both paths.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close $extr_text;
        } else {
            # An unreadable file is treated the same as one with no text.
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (defined(my $logline = <$errlog>)) {
                print STDERR $logline;
            }
            close $errlog;
        }
        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (defined(my $logline = <$errlog>)) {
                        print $faillog $logline;
                    }
                    close $errlog;
                }
                close $faillog;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    # The shell redirection above creates the .out file on non-Windows
    # platforms, so it has to be cleaned up on success as well.
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1094
# Convert a PostScript document to text.
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Parameters:
#   $input_filename  - path of the PostScript file to convert.
#   $output_filestem - the text is written to "<stem>.text"; "<stem>.err"
#                      and "<stem>.text.tmp" are used as temporary files.
#
# Reads the file-level variables $timeout and $faillogfile.
#
# Strategy: on non-Windows platforms run gs with ps2ascii.ps. If that is not
# possible or fails (including an "Error: ..." first line in its output),
# fall back to a crude regular-expression based extraction done in Perl.
# Whatever text results is then re-wrapped so that lines break at the first
# whitespace after 72 characters.
#
# Returns 1 on success; 0 if the fallback cannot read the input, finds no
# "%!" PostScript header, or cannot write the output. Dies if the files used
# by the wrapping step cannot be opened.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system. Test the whole wait status rather than
        # only the exit-code byte, so that death by signal also counts as a
        # failure. If the program couldn't be started, look at $! for a message.
        my $wait_status = $?;

        if ($wait_status != 0) {
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $psout, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$psout>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $psout;
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (defined(my $logline = <$errlog>)) {
                    print $faillog $logline;
                }
                close $errlog;
            }
            close $faillog;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for the whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close $in;

        # Make sure this is a ps file... (checked before the output file is
        # created, so a rejected input leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print $out "$text";
        close $out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &FileUtils::moveFiles($text_file, "$text_file.tmp");
    # "or" (not "||") so that the die applies to open() and not to the filename
    open(my $infile, '<', "$text_file.tmp")
        or die "Couldn't open file: $!";
    open(my $outfile, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$infile>)) {
        while (length($line) > 0) {
            # Cut off at least $wrap_length characters plus the rest of the
            # current word, and swallow the whitespace that follows it. If
            # the pattern does not match, emit the remainder unchanged so
            # that the loop always makes progress.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $outfile "$1\n";
            } else {
                print $outfile "$line";
                $line = "";
            }
        }
    }
    close $infile;
    close $outfile;
    &FileUtils::removeFiles("$text_file.tmp");

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1265
1266
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Parameters:
#   $input_filename  - path of the file to convert.
#   $output_filestem - the page is written to "<stem>.html"; "<stem>.text"
#                      is used as an intermediate file and removed afterwards.
#
# The text is first extracted with any_to_text(). Each non-blank line of it
# is emitted after a "<br>", each blank line becomes a "<p>", and the
# characters &, < and > are escaped so the extracted text cannot be
# interpreted as markup.
#
# Returns 1 on success, 0 if any_to_text() fails or either file cannot be
# opened.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my $text_fh;
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_fh;
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # '&' must be escaped first, otherwise the entities produced for
        # '<' and '>' would themselves be re-escaped.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1303
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Parameters:
#   $input_filename  - path of the file to extract strings from.
#   $output_filestem - the extracted text is written to "<stem>.text".
#
# Reads the file-level variable $use_strings; when it is false nothing is
# attempted.
#
# Returns 1 if at least one chunk of text was written; 0 if $use_strings is
# off, a file cannot be opened, or nothing printable was found (in which
# case the output file is removed again).

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    # Three-argument open, so that unusual characters or surrounding
    # whitespace in the file name are never taken as part of the open mode.
    open(my $in, '<', $input_filename) or return 0;
    binmode($in);
    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close $in;
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {
        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: