Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 23473

Last change on this file since 23473 was 23473, checked in by ak19, 13 years ago
Provision for supporting .docx and .pptx files when Windows scripting is on.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 45.2 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
46	BEGIN {
47	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
48	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
49	}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use Cwd;
56	use File::Basename;
57
# Detect whether we are running on an NT-family Windows (NT, 2000 or later).
# The Win32 module is loaded inside eval so that a perl without it simply
# yields undef, which is then normalised to 0.  The flag is consulted by the
# converters when deciding whether STDERR of an external command may be
# redirected to a file.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined($is_winnt_2000);

# Command-line switches; main() hands references to these to
# parsargv::parse(), and the conversion routines read them afterwards.
my ($use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting);
69
# Write the command-line synopsis to STDERR and terminate the script with
# exit status 1.  Never returns.
sub print_usage
{
    my @usage = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage) {
        print STDERR $usage_line;
    }
    exit(1);
}
91
# File that converter error messages are appended to (-errlog) and the CPU
# time limit in seconds (-timeout, 0 = no limit).  Both are filled in by
# main() and read by the conversion routines.
my $faillogfile="";
my $timeout=0;

# main(@args)
#
# Parses the command line, works out the input type (from -type or, failing
# that, from the filename extension), changes into the directory of the input
# file and dispatches to the matching convertXXX routine.  The routine's
# result ("html", "text", "item" or "fail") is printed on STDOUT followed by
# a newline.  Usage errors and unsupported types terminate the script with
# exit status 1.
sub main
{
    my (@ARGV) = @_;
    my ($input_type,$output_type,$verbose);

    # The values accepted by -type depend on whether -windows_scripting is
    # in use.  There are VB scripts for Word and PowerPoint only (none for
    # Excel), hence docx/pptx but no xlsx in the enhanced set.
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The arguments have not been parsed yet, so the switch still carries
    # its leading dash (see print_usage).  The dash is optional in the
    # pattern so that the bare word is still recognised.
    foreach my $arg (@ARGV) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with a '/' unlike every other
    # spec in this list -- confirm against parsargv that this is intended.
    if (!parsargv::parse(\@ARGV,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
                         ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@ARGV) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $ARGV[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back on the extension.  fileparse keeps the
    # leading dot on the suffix, so drop it.  A file without an extension
    # leaves the type undefined so that the check below can report it.
    if (!defined $input_type || $input_type eq "")
    {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    # NOTE(review): $input_filename is used unchanged after this chdir, so a
    # relative path with a directory part would no longer resolve -- confirm
    # that callers always pass an absolute path.
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility.  All patterns are anchored at both ends so an
    # extension that merely ends in e.g. "ppt" is not mistaken for one.
    my $result;
    if ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}

&main(@ARGV);
200
201
202
203	# Document-type conversion functions
204	#
205	# The following functions attempt to convert documents from their
206	# input type to the specified output type. If no output type was
207	# given, then they first attempt HTML, and then TEXT.
208	#
209	# Each returns the output type ("html" or "text") or "fail" if no
210	# conversion is possible.
211
# Convert a file that claims to be a Microsoft Word document.
#
# The extension cannot be trusted (many .doc files are really something
# else), so the content is sniffed first and the work is handed to the
# converter for whatever the file really is.  Returns that converter's result.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $detected = &find_docfile_type($input_filename);

    # RTF in disguise
    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    # Genuine Word 6/7/8 content, or a .docx
    my %word_flavour = map { ($_ => 1) } ("word6", "word7", "word8", "docx");
    if ($word_flavour{$detected}) {
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    # Anything else gets the generic treatment
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
229
# Convert a Microsoft Word 6/7/8 document.
#
# When HTML is acceptable (no output type given, or one containing "html")
# the Windows VB script is tried if -windows_scripting is on, otherwise wv.
# If that does not produce HTML the generic converter takes over.

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
249
250
# Convert a Rich Text Format (RTF) file.
#
# Only HTML output is attempted: through the Windows VB script when
# -windows_scripting is on, otherwise through rtftohtml.  Returns "html" on
# success and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if ($converted);
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
277
278
# Convert an unidentified file.
#
# Tries the generic HTML extractor and then the generic text extractor, each
# only if the requested output type permits it (an empty or undefined type
# permits both).  Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if (&any_to_html($input_filename, $output_filestem));
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if (&any_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
303
304
305
# Convert an Adobe PDF document.
#
# Arguments: directory of the input file, input filename, output filestem
# and the requested output type (may be empty or undefined for "any").
# Tries, as far as the output type allows, page images, then HTML, then
# text.  Returns "item" (page images), "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # An omitted output type means "anything will do"; normalise it so the
    # pattern matches below do not operate on undef.
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last dash of the output type.
    # NOTE(review): the quantifiers were missing from this pattern in the
    # copy under review (it read ".\-(.)"); ".*" is assumed -- confirm.
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html"
            if (&pdf_to_html($dirname, $input_filename, $output_filestem));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (&pdf_to_text($dirname, $input_filename, $output_filestem));
    }

    return "fail";

}
340
341
# Convert an Adobe PostScript document.
#
# Arguments: directory of the input file, input filename, output filestem
# and the requested output type (may be empty or undefined for "any").
# Tries page images and then text, as far as the output type allows.
# Returns "item" (page images), "text" or "fail".

sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # An omitted output type means "anything will do"; normalise it so the
    # pattern matches below do not operate on undef.
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last dash of the output type.
    # NOTE(review): the quantifiers were missing from this pattern in the
    # copy under review (it read ".\-(.)"); ".*" is assumed -- confirm.
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if (&pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type));
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if (&ps_to_text($input_filename, $output_filestem));
    }

    return "fail";
}
366
367
# Convert a Microsoft PowerPoint presentation.
#
# With -windows_scripting and an output type that is neither html nor text,
# the pptextract script is run to produce images in a directory named after
# the output filestem ("item" is returned, also when that directory already
# exists).  Otherwise, when HTML is acceptable, ppttohtml.pl is run.  If
# neither path succeeds the generic text extractor is the last resort.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto the script's switch
        my $image_switch = ($output_type =~ m/gif/i)  ? "-g"
                         : ($output_type =~ m/jp?g/i) ? "-j"
                         : ($output_type =~ m/png/i)  ? "-p"
                         :                              "";

        my $extractor = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                            $ENV{'GSDLOS'}, "pptextract");
        # on windows rely on the search path instead of a full pathname
        $extractor = "pptextract" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

        my $command = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $command .= "$extractor $image_switch \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($command) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $command = "perl -S ppttohtml.pl " . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($command) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
430
431
# Convert a Microsoft Excel spreadsheet.
#
# When HTML is acceptable xlstohtml.pl is run; if that is not wanted or
# fails, the generic text extractor is tried.  Returns "html", "text" or
# "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $command = "perl -S xlstohtml.pl " . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $command .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($command) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    return "text" if (&any_to_text($input_filename, $output_filestem));

    return "fail";
}
464
465
466
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns "docx" (only with -windows_scripting, judged by the extension
# alone), "rtf" (file starts with "{\rtf"), "word6"/"word7"/"word8" (a
# "Word.Document.N" marker occurs somewhere in the file) or "unknown"
# (anything else, including a file that cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # main() lower-cases extensions before dispatching, so accept any case
    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    my $chk;
    if (!open($chk, "<", $input_filename)) {
        print STDERR "gsConvert.pl: unable to open $input_filename to determine its type: $!\n";
        return "unknown";
    }
    binmode($chk);

    my $doctype = "unknown";
    my $first = 1;

    while (defined(my $line = <$chk>)) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }

    }

    # single exit point so the handle is closed on every path
    close($chk);

    return $doctype;
}
507
508
509	# Specific type-to-type conversions
510	#
511	# Each of the following functions attempts to convert a document from
512	# a specific format to another. If they succeed they return 1 and leave
513	# the output document(s) in the appropriate place; if they fail they
514	# return 0 and delete any working files.
515
516
# Attempt to convert a word document to html with the wv program
#
# Arguments: the input filename and the output filestem; the HTML is written
# to "<filestem>.html", wvWare's STDERR to "<filestem>.err" (where STDERR
# can be redirected) and any images to a "<docname>_files" folder next to
# the input document.
# Returns 1 if wvWare exited successfully and the HTML file starts with a
# "DOCTYPE HTML" line, 0 otherwise.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux a wv installation may live in its own folder under bin;
    # if so, put its bin and lib directories on the search paths.
    my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
    if ( -d $wv_home && $ENV{'GSDLOS'} eq "linux" ) {
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the previous value when there is one: an unset
        # variable would otherwise leave a trailing ':', i.e. an empty
        # entry, which the dynamic loader reads as the current directory.
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        } else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message)
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a seperate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");
    #print "assoc_dir: ".$assoc_dir."\n"; # same as "$output_filestem._files"

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if(-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # every image name is prefixed with the document name (see the note on
    # --basename above)
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    #print STDERR "**toppath: $toppath\n**docname: $docname\n;
    #print STDERR "****img_basenames: $img_basenames\n" if($img_basenames);
    #print STDERR "****assoc_dir: $assoc_dir\n" if($assoc_dir);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    # Replaced the next line with the 2 lines following it:
    # $cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            # Copy wvWare's messages to STDERR and, if one was requested
            # and can be opened, to the fail log.
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, ">>", $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errfile>)) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($faillog);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($faillog);

                } # while errfile
                close($errfile);
            }
            close($faillog) if ($faillog);
        }
        # don't leave an empty image folder behind
        &util::rm_r($assoc_dir)
            if (-d $assoc_dir && &util::is_dir_empty($assoc_dir));
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Inserted this code to remove the images directory if it was still empty after
            # the html was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                #print STDERR "*gsConvert.pl: Image dir $assoc_dir is empty, removing*\n";
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (defined(my $errline = <$errlog>)) {
                    print {$faillog} $errline;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    # don't leave an empty image folder behind
    &util::rm_r($assoc_dir)
        if (-d $assoc_dir && &util::is_dir_empty($assoc_dir));

    return 0;
}
672
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
# Returns 1 when the html file was rewritten, 0 when it could not be read or
# written (a message is printed on STDERR).
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless(open($fin, "<", $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef; # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);
    #print STDERR $html_contents;

    # 2. Replace all occurrences of assoc_dir_path at the start of an href
    # or src value of an <a> or <img> tag with assoc_dirname.

    # wvWare puts %20 in place of a space, so the prefix we look for has to
    # use %20 too.  The path is then matched literally: quotemeta escapes
    # every character with a special meaning in a regular expression
    # (backslashes of Windows paths, dots, brackets, parentheses, ...).
    (my $path_in_html = $assoc_dir_path) =~ s/ /%20/g;
    my $path_re = quotemeta($path_in_html);

    # "<a" or "<img", any other attributes within the same tag, then the
    # first href= or src=.
    # NOTE(review): in the copy under review the quantifiers of this
    # pattern had been lost (".?" where the accompanying description says
    # "any other attributes and values"); it was rebuilt from that
    # description -- confirm against the repository.
    my $link_attr_re = qr/<(?:a|img)\b[^>]*?(?:href|src)=/i;

    # Everything in front of the folder's pathname (including an optional
    # opening quote) is retained; only the pathname itself is replaced.
    $html_contents =~ s/(${link_attr_re}["']?)${path_re}/$1$assoc_dirname/g;

    # 3. Post-process the links that now point into the associated folder
    # (%20 back to space, backslashes to slashes, % to %25); see
    # post_process_assocfile_urls.  Other links are left untouched.
    # A quoted value runs up to the matching quote, an unquoted one up to
    # the next whitespace, quote or '>'.
    my $dir_re = quotemeta($assoc_dirname);
    $html_contents =~ s{(${link_attr_re})(?:(["'])(${dir_re}.*?)\2|(${dir_re}[^\s"'>]*))}
                       {defined($2) ? &post_process_assocfile_urls($1.$2, $3, $2)
                                    : &post_process_assocfile_urls($1, $4, "")}sge;

    #print STDERR "**assoc_dirname: $assoc_dirname*\n";

    # delete the original file and recreate it
    # (util::rm is handed a copy of the filename, as before -- presumably
    # to guard against it altering its argument; confirm)
    my $copy_of_filename = $html_file;
    &util::rm($copy_of_filename); # deleted the file

    # Recreate the original file for writing the updated contents
    my $fout;
    unless(open($fout, ">", $html_file)) { # open it as a new file for writing
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents and close the file; buffered write
    # errors only surface at close, so check that too
    print {$fout} $html_contents;
    unless(close($fout)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }
    return 1;
}
758
# Utility routine that tidies the pathname part of an img src / href link.
# Given the text before the pathname, the pathname and the text after it:
#   - every %20 in the pathname becomes a space (not an underscore, since
#     underscores mess with incremental rebuild),
#   - every backslash becomes a url slash (/),
#   - every remaining percent sign becomes %25.
# Returns the three pieces joined back together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;      # must run before the % rule below
        s/\\/\//g;
        s/%/%25/g;
    }

    return "$pre$text$post";
}
774
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments: the input filename and the output filestem; the HTML goes to
# "<filestem>.html" and the script's STDERR to "<filestem>.err" (where
# STDERR can be redirected).
# Returns 1 if "<filestem>.html" already exists (nothing is run) or if the
# script exited successfully and the first line of its output mentions
# "html"; returns 0 otherwise.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # on windows rely on the search path instead of a full pathname
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);
    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            # Copy the script's messages to STDERR and, if one was
            # requested and can be opened, to the fail log.
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, ">>", $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (defined(my $line = <$errfile>)) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($faillog);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($faillog);

                } # while errfile
                close($errfile);
            }
            close($faillog) if ($faillog);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (defined(my $errline = <$errlog>)) {
                    print {$faillog} $errline;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
853
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments:
#   $input_filename  - path of the RTF file to convert
#   $output_filestem - output path without extension; the converter is told
#                      to write "$output_filestem.html"
#
# Returns 1 when an HTML file with some word characters after its <body>
# line was produced (a "${output_filestem}_ToC.html" file, if present, is
# merged into it).  Returns 0 otherwise; in that case the .html and .err
# files are removed and the converter's stderr is appended to $faillogfile
# when one is set.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $src_file  = "$output_filestem.src";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    # stderr is only redirected where the shell is known to support "2>"
    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header
        if (open(my $html_in, '<', $html_file)) {
            my $past_header = 0;
            while (my $line = <$html_in>) {
                if (!$past_header) {
                    # everything up to and including the <body> line is header
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                # strip tags, then look for any word character
                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html_in);
        }
        else {
            # an unreadable output file counts as a failed conversion
            print STDERR "rtf_to_html: couldn't read $html_file: $!\n";
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # Open the three files in turn.  "open X, $name || ..." would
            # apply || to the file name, so each result is tested explicitly.
            my ($src_in, $toc_in, $html_out);
            my $src_ok  = open($src_in, '<', $src_file);
            my $toc_ok  = $src_ok && open($toc_in, '<', $toc_file);
            my $html_ok = $toc_ok && open($html_out, '>', $html_file);

            if (!$html_ok) {
                # can't merge: restore the converter's HTML untouched; the
                # conversion itself still counts as a success
                close($toc_in) if ($toc_ok);
                close($src_in) if ($src_ok);
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: every line up to the first
            # one without a word character (that line itself is dropped)
            while (defined(my $header_line = <$src_in>)) {
                last if ($header_line !~ m/\w/);
                print {$html_out} $header_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc_in>;   # ignore first 2 lines
            $skipped = <$toc_in>;
            my $list_open = <$toc_in>; # line 3 = "<ol>\n"
            print {$html_out} $list_open if (defined $list_open);
            while (my $toc_line = <$toc_in>) {
                $toc_line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc_in);

            # rest of html src
            while (my $body_line = <$src_in>) {
                print {$html_out} $body_line;
            }
            close($src_in);
            close($html_out);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Failure: copy the converter's stderr to the fail log, then clean up.
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
970
971
# Convert a pdf file to html with the pdftohtml command
#
# Arguments:
#   $dirname         - not used in this sub
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension, passed to pdftohtml.pl
#
# The command line is built from the file-level settings $timeout,
# $pdf_zoom, $pdf_complex, $pdf_ignore_images, $pdf_allow_images_only and
# $pdf_nohidden.
#
# Returns 1 when the command exits with status 0 and a non-empty
# "$output_filestem.html" exists.  Otherwise returns 0 after echoing the
# converter's error output to STDERR, appending it to $faillogfile (when
# set) and removing the .html, .out and .err files.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        # no "2>" here: only stdout is captured, into the .err file
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file) {
        &util::rm($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            if (open(my $errlog, '<', $err_file)) {
                print STDERR "pdftohtml error log:\n";
                while (my $err_line = <$errlog>) {
                    print STDERR $err_line;
                }
                close($errlog);
            }
            else {
                # not fatal: carry on so the temporary files still get
                # cleaned up and the caller gets its 0 return value
                print STDERR "pdf_to_html: couldn't read $err_file: $!\n";
            }
        }
        print STDERR "***********output filestem $output_filestem.html\n";
        &util::rm($html_file) if (-e $html_file);

        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print {$faillog} $err_line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1034
# Convert a pdf file to various types of image with the convert command
#
# Arguments:
#   $dirname         - not used in this sub
#   $input_filename  - path of the PDF/PS file
#   $output_filestem - output path without extension, passed to pdfpstoimg.pl
#   $output_type     - requested output type; only the part after the last
#                      underscore is passed on as -convert_to
#
# Returns 1 when pdfpstoimg.pl exits with status 0.  Returns 0 when the
# ImageMagick "identify" probe fails, or when the conversion command fails
# (its error output is echoed to STDERR and appended to $faillogfile when
# set).  The .out and .err files are removed in both outcomes.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # the output is captured only so that it is not shown; $? is what matters
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # keep only what follows the last underscore (a value of the form
    # "xxx_jpg" becomes "jpg"); a value without an underscore is unchanged
    $output_type =~ s/.*\_(.*)/$1/i;

    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        # name the script that was actually run
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    #make sure the converter made something
    #if ($retval !=0) || ! -s "$output_filestem")
    if ($retval != 0) {
        &util::rm($out_file) if (-e $out_file);

        #print out the converter's std err, if any
        if (-s $err_file) {
            if (open(my $errlog, '<', $err_file)) {
                print STDERR "pdfpstoimg error log:\n";
                while (my $err_line = <$errlog>) {
                    print STDERR $err_line;
                }
                close($errlog);
            }
            else {
                # not fatal: carry on so the temporary files still get
                # cleaned up and the caller gets its 0 return value
                print STDERR "pdfps_to_img: couldn't read $err_file: $!\n";
            }
        }

        #&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print {$faillog} $err_line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1103
# Convert a PDF file to text with the pdftotext command
#
# Arguments:
#   $dirname         - not used in this sub
#   $input_filename  - path of the PDF file
#   $output_filestem - output path without extension; pdftotext is told to
#                      write "$output_filestem.text"
#
# Returns 1 when a non-empty .text file containing at least one word
# character was produced.  Otherwise returns 0 after echoing the command's
# error output to STDERR, appending it to $faillogfile (when set) and
# removing the .text file.  The .out and .err files are removed in both
# outcomes.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            # defined() so that a final line of "0" cannot end the scan early
            while (defined(my $line = <$extr_text>)) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        }
        else {
            # an unreadable file is treated like one with no text
            warn "open: $!";
        }
        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # print out the converters std err, if any
        if (-s $err_file) {
            if (open(my $errlog, '<', $err_file)) {
                print STDERR "pdftotext error log:\n";
                while (my $err_line = <$errlog>) {
                    print STDERR $err_line;
                }
                close($errlog);
            }
            else {
                # not fatal: carry on so the temporary files still get
                # cleaned up and the caller gets its 0 return value
                print STDERR "pdf_to_text: couldn't read $err_file: $!\n";
            }
        }

        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $err_line = <$errlog>) {
                        print {$faillog} $err_line;
                    }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    # the "> .out" redirect above creates this file on non-Windows systems
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1169
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - path of the PostScript file
#   $output_filestem - output path without extension; the text ends up in
#                      "$output_filestem.text"
#
# On non-Windows systems gs is run with ps2ascii.ps.  If that fails (or on
# Windows, where gs is not attempted) the error is reported to STDERR and
# $faillogfile, and a regexp-based fallback strips text out of the file
# directly.  Whichever path produced the text, it is then re-wrapped at the
# first whitespace after 72 characters.
#
# Returns 1 on success.  Returns 0 when the fallback cannot read the input
# or write the output, when the input does not start with "%!", or when
# the files used for wrapping cannot be opened.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted, like the other paths, so a filestem with spaces works
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # see man perlfunc - system for this...  Any non-zero $? is a
        # failure: -1 (couldn't start program), a non-zero exit status in
        # the high byte, or death by signal in the low bits.
        my $wait_status = $?;

        if ($wait_status != 0) {
            if ($!) {$error = "$!";} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close($psout);
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $in;
        if (!open($in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in>); # see man perlport, under "System Resources"
        close($in);

        # Make sure this is a ps file...  (checked before the output file is
        # created, so a rejected input leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $out;
        if (!open($out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$out} "$text";
        close($out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);

    # "open X, $name || die" would apply || to the file name and never check
    # the open, so each result is tested explicitly.  A failure is reported
    # through the return value instead of killing the whole script.
    my ($infile, $outfile);
    if (!open($infile, '<', $tmp_file)) {
        warn "Couldn't open file: $!";
        return 0;
    }
    if (!open($outfile, '>', $text_file)) {
        warn "Couldn't open file for writing: $!";
        close($infile);
        return 0;
    }

    while (my $line = <$infile>) {
        while (length($line) > 0) {
            # Take $wrap_length characters plus the rest of the current word,
            # then swallow the whitespace that follows.  If the substitution
            # does not apply, the remainder is written out as it stands, so
            # the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$outfile} "$1\n";
            } else {
                print {$outfile} "$line";
                $line = "";
            }
        }
    }
    close($infile);
    close($outfile);
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1339
1340
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text, then wrapped in a minimal
# HTML page: blank lines become <p>, every other line is prefixed with <br>.
# The intermediate .text file is removed.
#
# Returns 1 on success, 0 if any_to_text fails or if the text or html file
# cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_in, $html_out);
    if (!open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close($text_in);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_out} "<html><head>\n";
    print {$html_out} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_out} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_out} "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # escape the HTML metacharacters; & must go first so that the
        # entities produced for < and > are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_out} "<p>";
        } else {
            print {$html_out} "<br> ", $line;
        }
    }
    print {$html_out} "\n</body></html>\n";

    close($html_out);
    close($text_in);

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1377
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename  - path of the file to read (read in binary mode)
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
#
# Returns 0 immediately unless the file-level $use_strings flag is set.
# Otherwise returns 1 if at least one chunk of text was written, and 0 if
# either file could not be opened or nothing usable was found (the .text
# file is then removed).
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    print STDERR "\n** In any to text**\n\n";
    my ($in, $out);
    open($in, '<', $input_filename) or return 0;
    binmode($in);
    if (!open($out, '>', $text_file)) {
        # don't leave the input handle open on the way out
        close($in);
        return 0;
    }

    my $output_line_count = 0;
    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print {$out} $line;
            ++$output_line_count;
        }
    }

    close($out);
    close($in);

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: