Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 32223

Last change on this file since 32223 was 32223, checked in by ak19, 6 years ago
When no output mode for PDFPlugin has been set by the user, the output mode now defaults to paged_html (previously html). paged_html uses xpdftools to do the PDF conversion, which will apparently work for all versions of PDF so it gives better version coverage than the old pdftohtml.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 41.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
BEGIN {
    # The Greenstone modules loaded below live in $GSDLHOME/perllib, so
    # refuse to start without GSDLHOME and put that directory at the
    # front of the module search path.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# If Win32 cannot be loaded the eval yields undef, which is normalised
# to 0 on the following line.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to
# parsargv::parse().
my ($use_strings, $pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only, $windows_scripting);
69
sub print_usage
{
    # Write the command-line help to STDERR, fragment by fragment, then
    # terminate the script with exit status 1. This sub never returns.
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $fragment (@usage_text) {
        print STDERR $fragment;
    }
    exit(1);
}
91
# Settings shared with the conversion subs further down the file.
my $faillogfile = "";   # -errlog: file that converter errors are appended to
my $timeout     = 0;    # -timeout: passed to "ulimit -t" when non-zero
my $verbosity   = 0;    # -verbose

# main(@args)
#
# Entry point. Parses the command line, determines the input type (from
# -type or, failing that, from the filename extension), changes into the
# input file's directory, dispatches to the matching convertXXX sub and
# prints the string that sub returns, followed by a newline, on STDOUT.
#
# Exits with status 1 (directly or through print_usage) when the
# arguments are invalid, the input file is unreadable, no input type can
# be determined, or the type is unsupported.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    my $default_type_re  = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS)
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;
    foreach my $arg (@args) {
        # The option arrives with its leading dash(es); accept the bare
        # word as well so that older callers keep working.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # Exactly one non-option argument (the input file) must remain.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # Fall back on the filename extension (minus its dot) when no -type
    # was given; a file without an extension leaves the type empty.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : "";
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if ($input_type eq "") {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        print &convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "rtf") {
        print &convertRTF($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print &convertPS($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^pptx?$/) {
        print &convertPPT($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        print &convertXLS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

&main(@ARGV);
203
204
205
206	# Document-type conversion functions
207	#
208	# The following functions attempt to convert documents from their
209	# input type to the specified output type. If no output type was
210	# given, then they first attempt HTML, and then TEXT.
211	#
212	# Each returns the output type ("html" or "text") or "fail" if no
213	# conversion is possible.
214
# Convert a Microsoft word document
#
# Sniffs the real content type with find_docfile_type() and hands the
# file to the matching converter. Returns whatever that converter
# returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $realtype = &find_docfile_type($input_filename);

    my %is_word_type = map { $_ => 1 } qw(word6 word7 word8 docx);

    if ($is_word_type{$realtype}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }
    if ($realtype eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 document
#
# Tries an HTML conversion when no output type was given or when the
# requested type mentions html; otherwise, or if that fails, defers to
# convertAnything(). Returns "html" or convertAnything()'s result.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_requested = !$output_type || ($output_type =~ m/html/i);
    if ($html_requested) {
        # -windows_scripting selects the VB-script based converter.
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file
#
# Only an HTML conversion is attempted (when no output type was given or
# the requested type mentions html). Returns "html" on success and
# "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    my $html_requested = !$output_type || ($output_type =~ m/html/i);
    if ($html_requested) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
280
281
# Convert an unidentified file
#
# Tries the generic HTML converter and then the generic text converter,
# each only when no output type was given or the requested type mentions
# it. Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Ordered list of [pattern the output type must match, converter,
    # value returned when that converter succeeds].
    my @attempts = (
        [ qr/html/i, \&any_to_html, "html" ],
        [ qr/text/i, \&any_to_text, "text" ],
    );

    foreach my $attempt (@attempts) {
        my ($wanted_re, $converter, $result) = @$attempt;
        next unless !$output_type || ($output_type =~ $wanted_re);
        return $result if $converter->($input_filename, $output_filestem);
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document
#
# Works through the converters in a fixed order (image, old pdftohtml,
# xpdf paged HTML, text), running each one only when the requested
# output type selects it. Returns "item", "html", "paged_html" or "text"
# for the first converter that succeeds, and "fail" if none does.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    # Uses the old pdftohtml that doesn't work for newer PDF versions
    if ($output_type =~ m/^html/i) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to (paged) HTML using the newer pdftohtml of
    # Xpdftools. This is what runs when no output type was specified.
    if (!$output_type || ($output_type =~ m/paged_html/i)) {
        return "paged_html"
            if &xpdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if &pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
356
357
# Convert an Adobe PostScript document
#
# Tries an image conversion when the output type names jpg/gif/png, then
# a text conversion when no output type was given or it mentions text.
# Returns "item", "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if &ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
382
383
# Convert a Microsoft PowerPoint document
#
# With -windows_scripting and an output type that is neither html nor
# text, runs the pptextract script to produce per-slide images and
# returns "item". Otherwise, when no output type was given or it mentions
# html, runs ppttohtml.pl and returns "html". If the chosen converter
# fails (or none was chosen) falls back on any_to_text() and returns
# "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting
        && !($output_type =~ m/html/i || $output_type =~ m/text/i)) {

        # Image format flag handed to pptextract.
        my $ppt_convert_type =
              ($output_type =~ m/gif/i)  ? "-g"
            : ($output_type =~ m/jp?g/i) ? "-j"
            : ($output_type =~ m/png/i)  ? "-p"
            :                              "";

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            # now we use the .vbs VBScript
            $vbScript = "CScript //Nologo \"" . $vbScript . ".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        if (system($cmd) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML
        my $perl_exec = &util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction.
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
449
450
# Convert a Microsoft Excel document
#
# When no output type was given or it mentions html, runs xlstohtml.pl
# and returns "html" on success. Otherwise, or if that fails, falls back
# on any_to_text() and returns "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $perl_exec = &util::get_perl_exec();
        my $cmd = "\"$perl_exec\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction.
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
484
485
486
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns:
#   "docx"    - -windows_scripting is on and the filename ends in .docx
#               (the file itself is not inspected)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7" or "word8"
#             - some line contains "Word.Document.<6|7|8>"
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if ($windows_scripting && $input_filename =~ m/\.docx$/) {
        return "docx";
    }

    # Three-argument open with a lexical handle: the filename cannot be
    # misread as an open mode, and the handle is closed on every path.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type  = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    close($chk);
    return $type;
}
527
528
529	# Specific type-to-type conversions
530	#
531	# Each of the following functions attempts to convert a document from
532	# a specific format to another. If they succeed they return 1 and leave
533	# the output document(s) in the appropriate place; if they fail they
534	# return 0 and delete any working files.
535
536
# Attempt to convert a word document to html with the wv program
#
# Runs wvware.pl through the perl interpreter reported by
# util::get_perl_exec(), passing it the input file, the output filestem,
# the fail log, the verbosity and the timeout.
#
# Returns the wait status of that command divided by 256; callers treat
# a true value as success (so presumably wvware.pl exits with 1 when the
# conversion worked - confirm against wvware.pl). Returns 0 when the
# command could not be started at all.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"" . &util::get_perl_exec() . "\" -S wvware.pl"
                   . " \"$input_filename\" \"$output_filestem\" \"$faillogfile\""
                   . " $verbosity $timeout";

    # print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command cannot be run; -1/256 would be
    # a true value and be mistaken for a successful conversion.
    if ($raw_status == -1) {
        print STDERR "Error: unable to launch wvware.pl: $!\n";
        return 0;
    }

    return $raw_status / 256;
}
551
# Attempt to convert a word document to html with the word2html scripting program
#
# Picks the converter (docx2html.vbs via CScript for .docx on Windows,
# word2html otherwise), runs it with STDERR captured in
# "$output_filestem.err" where redirection is used, and checks that the
# resulting "$output_filestem.html" is non-empty and mentions "html" on
# its first line.
#
# Returns 1 on success, or when "$output_filestem.html" already exists
# (the conversion is then skipped). Returns 0 on failure; converter
# error output is echoed to STDERR and/or appended to the fail log.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i);

    # build up the path to the doc-to-html conversion tool we're going to use
    my $bin_dir = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});
    my $vbScript;

    if ($on_windows) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting
        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($bin_dir, "docx2html.vbs");
            # launch with CScript for error output in STDERR
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"" . &FileUtils::filenameConcatenate($bin_dir, "word2html") . "\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if (!$on_windows || $is_winnt_2000);
    #print STDERR "@@@@@@@@@ cmd=$cmd\n";

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing $vbScript converter:$!\n";

        # Echo the converter's error output, and copy it to the fail log
        # when one was requested and can be opened.
        if (-s "$output_filestem.err"
            && open(my $errfile, '<', "$output_filestem.err")) {

            my $faillog;
            my $write_to_fail_log = 0;
            if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                $write_to_fail_log = 1;
            }

            while (my $line = <$errfile>) {
                if ($line =~ m/\w/) {
                    print STDERR "$line";
                    print {$faillog} "$line" if ($write_to_fail_log);
                }
                next if ($line !~ m/startup error/);
                print STDERR " (given an invalid .DOC file?)\n";
                print {$faillog} " (given an invalid .DOC file?)\n"
                    if ($write_to_fail_log);
            }

            close($faillog) if ($write_to_fail_log);
            close($errfile);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        my $first_line = <$html_in>;
        close($html_in);
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
652
# Attempt to convert an RTF document to html with rtftohtml
#
# Runs rtftohtml, writing "$output_filestem.html", and accepts the
# result only if some line after the one containing "<body>" still holds
# a word character once its tags are stripped. When rtftohtml also wrote
# "${output_filestem}_ToC.html", that table of contents (with its links
# made relative) is spliced into the output after the header.
#
# Returns 1 on success. Returns 0 on failure, after appending any
# converter error output to the fail log (when one is set) and removing
# the .err and .html working files.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html"
        && open(my $html_in, '<', "$output_filestem.html")) {
        # make sure we have some content other than header
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                # Skip everything up to and including the <body> line.
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # Each open is checked on its own result. Writing
            # 'open FH, "file" || ...' would attach the || to the
            # filename string and never notice a failed open.
            my $src_ok = open(my $htmlsrc,  '<', "$output_filestem.src");
            my $toc_ok = open(my $toc,      '<', "${output_filestem}_ToC.html");
            my $out_ok = open(my $html_out, '>', "$output_filestem.html");

            if (!($src_ok && $toc_ok && $out_ok)) {
                # Could not merge; put the converted file back as it was.
                close($htmlsrc)  if $src_ok;
                close($toc)      if $toc_ok;
                close($html_out) if $out_ok;
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html");
                return 1;
            }

            # print out header info from src html (everything up to the
            # first line without a word character, which is dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last unless $header_line =~ m/\w/;
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_open if defined $list_open;
            while (my $toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $body_line = <$htmlsrc>) {
                print {$html_out} $body_line;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
769
770
# Convert a pdf file to html with the old pdftohtml command
# which only works for older PDF versions
#
# Runs pdftohtml.pl with the -pdf_* options given on the command line.
# The conversion counts as successful when the command exits with 0 and
# "$output_filestem.html" is non-empty.
#
# Returns 1 on success (the .err/.out working files are removed).
# Returns 0 on failure, after echoing the converter's error output to
# STDERR, appending it to the fail log (when one is set) and removing
# the .out, .html and .err working files. $dirname is not used.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || !-s "$output_filestem.html") {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        #print STDERR "***********output filestem $output_filestem.html\n";
        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                # Copy only when the error file can actually be read.
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) { print {$faillog} $line; }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
834
835
# Convert a pdf file to html with the newer Xpdftools' pdftohtml
# This generates "paged HTML" where extracted, selectable text is positioned
# over screenshots of each page.
# Since xpdf's pdftohtml fails if the output dir already exists and for easier
# naming, the output files are created in a "pages" subdirectory of the tmp
# location parent of $output_filestem instead
#
# The conversion counts as successful when the command exits with 0 and
# the "pages" directory holds a non-empty index.html.
#
# Returns 1 on success (the .err/.out working files are removed).
# Returns 0 on failure, after echoing the converter's error output to
# STDERR, appending it to the fail log (when one is set) and removing
# the working files. $dirname is not used.
sub xpdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $xpdf_pdftohtml = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "xpdf-tools");

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # For Windows, just use the 32 bit xpdf's pdftohtml as it works the same as the 64 bit
        $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
    }
    else {
        # unix (linux|darwin), use the bin32/bin64 folder depending on the BITNESS env var

        # Don't use $ENV{'GSDLARCH'}, use the new $ENV{'BITNESS'}, since
        # $ENV{'GSDLARCH'} is only (meant to be) set when many other 32-bit or 64-bit
        # specific subdirectories exist in a greenstone installation.
        # None of those locations need exist when xpdf-tools is installed with GS.
        # So don't depend on GSDLARCH as forcing that to be exported has side-effects
        if ($ENV{'BITNESS'}) {
            $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin" . $ENV{'BITNESS'});
        }
        else { # what if $ENV{'BITNESS'} undefined, fallback on bin32? or 64?
            $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "bin32");
        }
    }

    # We'll create the file by name $output_filestem during post-conversion processing.
    # Note that Xpdf tools will only create its conversion products in a dir that does
    # not yet exist. So we'll create this location as a subdir of the output_filestem's
    # parent directory. The parent dir is the already generated tmp area for conversion. So:
    # - tmpdir gs2build/tmp/<random-num> already exists at this stage
    # - We'll create gs2build/tmp/<rand>/output_filestem.html later, during post-processing
    # - For now, XPdftools will create gs2build/tmp/<rand>/pages and put its products in there.
    # Only the directory part of the filestem is needed here.
    my (undef, $tmp_dirname, undef)
        = &File::Basename::fileparse($output_filestem, "\\.[^\\.]+\$");
    $tmp_dirname = &FileUtils::filenameConcatenate($tmp_dirname, "pages");

    $xpdf_pdftohtml = &FileUtils::filenameConcatenate($xpdf_pdftohtml, "pdftohtml");

    # xpdf's pdftohtml tool also takes a zoom factor, where a zoom of 1 is 100%
    my $cmd = "\"$xpdf_pdftohtml\"";
    $cmd .= " -z $pdf_zoom" if ($pdf_zoom);
    # $cmd .= " -c" if ($pdf_complex);
    # $cmd .= " -i" if ($pdf_ignore_images);
    # $cmd .= " -a" if ($pdf_allow_images_only);
    # $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$tmp_dirname\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    #print STDERR "@@@@ Running command: $cmd\n";

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing xpdf's pdftohtml tool";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";
    }

    # make sure the converter made something
    my $index_file = &FileUtils::filenameConcatenate($tmp_dirname, "index.html");
    if ($retval != 0 || !-s $index_file) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $errlog, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdftohtml error log:\n";
            while (my $line = <$errlog>) {
                print STDERR "$line";
            }
            close($errlog);
        }
        #print STDERR "***********output filestem $output_filestem.html\n";
        &FileUtils::removeFiles("$tmp_dirname") if (-d "$tmp_dirname");

        if (-e "$output_filestem.err") {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                # Copy only when the error file can actually be read.
                if (open(my $errlog, '<', "$output_filestem.err")) {
                    while (my $line = <$errlog>) { print {$faillog} $line; }
                    close($errlog);
                }
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
937
938
939
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl
# (which relies on ImageMagick).
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the file to convert
#   $output_filestem - output path stem handed to pdfpstoimg.pl; the child's
#                      stdout/stderr are captured in "<stem>.out" and
#                      "<stem>.err"
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed on (e.g. "xxx_jpg" -> "jpg")
#
# Returns 1 if pdfpstoimg.pl exited with status 0, otherwise 0. On failure
# the captured stderr is echoed to STDERR and appended to $faillogfile (when
# that is set). The scratch .out/.err files are removed in both cases.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path
    # (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        my $imagick_cmd = "\"" . &util::get_perl_exec() . "\" -S gs-magick.pl";
        $imagick_cmd .= " --verbosity=$verbosity" if defined $verbosity;

        # Only the exit status matters; the captured output is discarded.
        my $result = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once
        # they're converted to signed values, it will be -1 for Linux and 1
        # for Windows. Whenever we test for return values other than 0, shift
        # by 8 and perform unsigned to signed status conversion on $? to get
        # the expected range of return values. Although gs-magick.pl already
        # shifts its $? by 8, converts it to a signed value and then exits on
        # that, by the time we get here, we need to do it again.
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1)
            || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not
            # available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) { $cmd = "ulimit -t $timeout;"; }

    # Keep only what follows the last underscore: that is the image type
    # pdfpstoimg.pl expects for -convert_to.
    $output_type =~ s/.*\_(.*)/$1/i;

    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) { print STDERR ": $!"; }
        print STDERR "\n";
    }

    # make sure the converter succeeded
    if ($retval != 0) {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # print out the converter's std err, if any
        if (-s "$output_filestem.err") {
            open(my $err_fh, '<', "$output_filestem.err") or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR "$err_line";
            }
            close $err_fh;
        }

        if (-e "$output_filestem.err") {
            # Append the converter's stderr to the fail log, when one is
            # configured and both files can be opened.
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                if (open(my $err_fh, '<', "$output_filestem.err")) {
                    while (my $err_line = <$err_fh>) { print $fail_fh $err_line; }
                    close $err_fh;
                }
                close $fail_fh;
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
1023
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   $dirname         - accepted for interface compatibility; not used here
#   $input_filename  - path of the PDF file
#   $output_filestem - output path stem; the text goes to "<stem>.text",
#                      and the command's stdout/stderr are captured in
#                      "<stem>.out" / "<stem>.err"
#
# Returns 1 if a non-empty "<stem>.text" containing at least one word
# character was produced, otherwise 0. On failure the captured stderr is
# echoed to STDERR and appended to $faillogfile (when that is set). The
# scratch .out/.err files are removed in both cases.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $text_fh, '<', $text_file)) {
            binmode($text_fh); # just in case...
            while (!$seen_text && defined(my $line = <$text_fh>)) {
                $seen_text = 1 if ($line =~ m/\w/);
            }
            close $text_fh;
        } else {
            # An unreadable output file is treated the same as "no text".
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $err_fh, '<', $err_file) or die "$!";
            print STDERR "pdftotext error log:\n";
            while (my $err_line = <$err_fh>) {
                print STDERR "$err_line";
            }
            close $err_fh;
        }

        &FileUtils::removeFiles($out_file) if (-e $out_file);
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        if (-e $err_file) {
            # Append the converter's stderr to the fail log, when one is
            # configured and both files can be opened.
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                if (open(my $err_fh, '<', $err_file)) {
                    while (my $err_line = <$err_fh>) { print $fail_fh $err_line; }
                    close $err_fh;
                }
                close $fail_fh;
            }
            &FileUtils::removeFiles($err_file);
        }
        return 0;
    }

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    # The non-Windows command redirects stdout to the .out file, so it must
    # be cleaned up on success as well as on failure.
    &FileUtils::removeFiles($out_file) if (-e $out_file);
    return 1;
}
1089
# Convert a PostScript document to text.
#
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# On Windows gs is not attempted. Whenever gs is skipped or fails, the text
# is extracted with a crude series of regular expressions instead. The
# resulting "<stem>.text" is finally re-wrapped so that each line breaks at
# the first whitespace after $wrap_length characters.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path stem; text goes to "<stem>.text", the gs
#                      stderr is captured in "<stem>.err"
#
# Returns 1 on success, 0 if the fallback extraction could not read the
# input, could not write the output, or the input lacks a "%!" header.
# Dies if the files used for the wrapping pass cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) { $cmd = "ulimit -t $timeout; "; }
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # Quote the stderr target too, so paths with spaces work.
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system for this... Test the whole of $? rather
        # than only the exit code in the high byte, so that a child killed
        # by a signal also counts as a failure.
        my $status = $?;
        # if system returns -1 | 127 (couldn't start program), look at $! for message

        if ($status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_out, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$ps_out>;
            if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close $ps_out;
        }
        else {
            $error = "could not read output file: $!";
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        print STDERR "Resorting to Perl regular expressions to extract text from PostScript...\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $fail_fh, '>>', $faillogfile)) {
            print $fail_fh "gs - $error\n";
            if (-e $err_file && open(my $err_fh, '<', $err_file)) {
                while (my $err_line = <$err_fh>) { print $fail_fh $err_line; }
                close $err_fh;
            }
            close $fail_fh;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        # Read and validate the input before creating the output file, so
        # that a failure does not leave an empty "<stem>.text" behind.
        my $in_fh;
        if (!open($in_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        my $text = join('', <$in_fh>); # whole .ps file; see man perlport, under "System Resources"
        close $in_fh;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $fail_fh, '>>', $faillogfile)) {
                print $fail_fh "Bad postscript header: not '%!'\n";
                close $fail_fh;
            }
            return 0;
        }

        my $out_fh;
        if (!open($out_fh, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g;  # e acute ('e)
        $text =~ s/\\177/\252/g;  # u"
        # $text =~ s/ ?? /\344/g; # a"

        print $out_fh "$text";
        close $out_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);

    # "or" (not "||") so that the die applies to open() itself rather than
    # to the filename string.
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";

    while (my $line = <$wrap_in>) {
        while (length($line) > 0) {
            # Cut after the first run of non-whitespace that passes
            # $wrap_length characters; if no cut is possible, emit the
            # remainder unchanged so the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                print $wrap_out "$line";
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1260
1261
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text() into "<stem>.text", then
# wrapped in a minimal HTML page written to "<stem>.html": blank lines
# become "<p>", every other line is prefixed with "<br> ". The intermediate
# text file is removed afterwards.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path stem
#
# Returns 1 on success, 0 if text extraction failed or either file could
# not be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (my $line = <$text_fh>) {
        # Escape HTML metacharacters; "&" must go first so the entities
        # produced for "<" and ">" are not escaped a second time.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1298
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Does nothing (returns 0) unless $use_strings is set. Otherwise each input
# line has its runs of non-printable bytes turned into line breaks, strings
# shorter than 10 characters dropped, and whatever remains written to
# "<stem>.text".
#
# Arguments:
#   $input_filename  - path of the file to read (read in binary mode)
#   $output_filestem - output path stem
#
# Returns 1 if at least one chunk of text was written; otherwise 0 (in
# which case any "<stem>.text" that was created is removed again).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);

    my $out_fh;
    if (!open($out_fh, '>', "$output_filestem.text")) {
        # Don't leave the input handle open on this early exit.
        close $in_fh;
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in_fh>) {
        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    close $out_fh;
    close $in_fh;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: