Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 24166

Last change on this file since 24166 was 24166, checked in by ak19, 13 years ago
2nd and tentatively final set of changes to get the new docx2html functionality to work on docx files. The changes have to do with error reporting when Word is not installed/can't be found/can't be instantiated, when the script is launched with the wrong number of args, and when the input file does not exist. WordPlugin now has docx as part of the default process_expression (even when OO is not installed).
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 46.7 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Compile-time environment setup. This has to happen in a BEGIN block so
# that $GSDLHOME/perllib is on @INC before the 'use parsargv' / 'use util'
# statements further down are compiled.
BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");

    # File::Basename is needed later in this script (fileparse) regardless
    # of whether PERLPATH is already set, so load it unconditionally rather
    # than only inside the PERLPATH branch.
    require File::Basename;

    # Default PERLPATH to the directory holding the running perl binary.
    if (!$ENV{'PERLPATH'}) {
        $ENV{'PERLPATH'} = File::Basename::dirname($^X);
    }
}
58
59	use strict;
60
61	use parsargv;
62	use util;
63	use Cwd;
64
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, which is
# normalised to 0 below.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line option values; filled in by the argument parser in main().
my (
    $use_strings,
    $pdf_complex,
    $pdf_nohidden,
    $pdf_zoom,
    $pdf_ignore_images,
    $pdf_allow_images_only,
    $windows_scripting,
);
76
# Write the command-line help text to STDERR and terminate the script.
# This sub never returns: it always exits with status 1.
sub print_usage
{
    my @usage_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    print STDERR join("", @usage_text);
    exit(1);
}
98
# File to which conversion error messages are appended ("" = no fail log),
# and the CPU-seconds limit applied via ulimit (0 = no limit).
my ($faillogfile, $timeout) = ("", 0);
101
# Script entry point.
#
# Parses the command-line arguments passed in @_, works out the input type
# (from -type or, failing that, from the filename extension), changes into
# the input file's directory, dispatches to the matching convertXXX routine
# and prints that routine's result ("html", "text", "item" or "fail")
# followed by a newline on STDOUT.
#
# Exits with status 1 (via print_usage or directly) on bad arguments, an
# unreadable input file or an unsupported input type.
sub main
{
    my (@args) = @_;
    my ($input_type,$output_type,$verbose);

    # fileparse() and rel2abs() are used below; load their modules here so
    # this routine does not depend on some other module having done so.
    require File::Basename;
    require File::Spec;

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not.
    # Currently only have VBA for Word and PPT (but no XLS).
    my $default_type_re  = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    foreach my $arg (@args) {
        # Options arrive with their leading dash (see the usage text), so
        # allow for it when looking for the flag.
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
            last;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with a '/' unlike every other
    # spec here -- confirm against parsargv that this is intended.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0',\$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting',\$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one non-option argument (the input file) must remain.
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below, which would invalidate a
    # relative path containing a directory part, so make it absolute first.
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname,$dirname,$suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the (lower-cased) filename extension.
    if (!defined $input_type || $input_type eq "") {
        ($input_type = lc($suffix)) =~ s/^\.//;
        undef $input_type if $input_type eq "";
    }

    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir ($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }
    print $result;
    print "\n";

    # restore to original working directory
    chdir ($stored_dir) || die "Unable to return to directory $stored_dir";

}
205
# Run the converter on the script's command-line arguments.
main(@ARGV);
207
208
209
210	# Document-type conversion functions
211	#
212	# The following functions attempt to convert documents from their
213	# input type to the specified output type. If no output type was
214	# given, then they first attempt HTML, and then TEXT.
215	#
216	# Each returns the output type ("html", "text", or "item" when page
217	# images were produced) or "fail" if no conversion is possible.
218
# Convert a Microsoft Word document.
#
# The file content (not just its extension) decides which converter runs:
# genuine Word 6/7/8 and docx files go to convertWord678, RTF files that
# merely carry a .doc name go to convertRTF, anything else to
# convertAnything. Returns whatever the chosen converter returns.
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected_type = &find_docfile_type($input_filename);

    my %is_word_type = map { $_ => 1 } qw(word6 word7 word8 docx);

    return &convertWord678($input_filename, $output_filestem, $output_type)
        if $is_word_type{$detected_type};

    return &convertRTF($input_filename, $output_filestem, $output_type)
        if $detected_type eq "rtf";

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
236
# Convert a Microsoft Word 6/7/8 (or docx) document.
#
# When HTML output is acceptable (no output type given, or one containing
# "html"), try the Windows scripting converter if -windows_scripting is
# set, otherwise wvWare. If that does not succeed, fall back to
# convertAnything. Returns "html" or convertAnything's result.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));

    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
256
257
# Convert a Rich Text Format (RTF) file.
#
# Only HTML conversion is attempted (via Windows scripting when
# -windows_scripting is set, otherwise rtftohtml). Returns "html" on
# success and "fail" otherwise.
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt specialised conversion to HTML
    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # return &convertAnything($input_filename, $output_filestem, $output_type);
    return "fail";
}
284
285
# Convert an unidentified file.
#
# Tries the generic HTML extractor first and the generic text extractor
# second, each only if the requested output type permits it (an empty
# output type permits both). Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
310
311
312
# Convert an Adobe PDF document.
#
# Arguments: the directory containing the input file, the input filename,
# the output path without extension, and the requested output type (may be
# undefined or empty, meaning "try HTML, then text").
#
# Returns "item" when page images were produced, "html", "text", or "fail"
# if no conversion succeeded.
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # An absent output type is a supported input (see the !$output_type
    # tests below); normalise it so the pattern operations don't act on undef.
    $output_type = "" unless defined $output_type;

    my $success = 0;
    # NOTE(review): this pattern is reproduced as found; confirm it against
    # the repository copy in case quantifiers were lost in extraction.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success){
            return "item";
        }
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";

}
347
348
# Convert an Adobe PostScript document.
#
# Arguments: the directory containing the input file, the input filename,
# the output path without extension, and the requested output type (may be
# undefined or empty).
#
# Returns "item" when page images were produced, "text", or "fail". There
# is no HTML conversion for PostScript.
sub convertPS {
    my ($dirname,$input_filename, $output_filestem, $output_type) = @_;

    # An absent output type is a supported input (see the !$output_type
    # test below); normalise it so the pattern operations don't act on undef.
    $output_type = "" unless defined $output_type;

    my $success = 0;
    # NOTE(review): this pattern is reproduced as found; confirm it against
    # the repository copy in case quantifiers were lost in extraction.
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success){
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }
    return "fail";
}
373
374
# Convert a Microsoft PowerPoint document.
#
# With -windows_scripting and an output type that is neither html nor
# text, the slides are exported as images by the pptextract tool and
# "item" is returned (also when the output directory already exists).
# Otherwise, if HTML is acceptable, ppttohtml.pl is run and "html" is
# returned on success. If the chosen converter fails, or neither branch
# applies, any_to_text is tried: returns "text" or finally "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $image_export = ($windows_scripting
                        && ($output_type !~ m/html/i)
                        && ($output_type !~ m/text/i));

    if ($image_export) {
        # Map the requested image format onto pptextract's format switch.
        my $ppt_convert_type = "";
        if    ($output_type =~ m/gif/i)  { $ppt_convert_type = "-g"; }
        elsif ($output_type =~ m/jp?g/i) { $ppt_convert_type = "-j"; }
        elsif ($output_type =~ m/png/i)  { $ppt_convert_type = "-p"; }

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if system($cmd) == 0;
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction.
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
438
439
# Convert a Microsoft Excel document.
#
# If HTML is acceptable, xlstohtml.pl is run and "html" is returned on
# success. Otherwise (or when that fails) any_to_text is tried: returns
# "text" or finally "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl "
                . " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if system($cmd) == 0;
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction.
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
473
474
475
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
# Returns one of:
#   "docx"    - -windows_scripting is on and the name ends in .docx
#               (any letter case)
#   "rtf"     - the first line starts with "{\rtf"
#   "word6", "word7", "word8"
#             - a "Word.Document.N" marker was found somewhere in the file
#   "unknown" - none of the above, or the file could not be opened
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/i)) {
        return "docx";
    }

    # An unreadable file simply cannot be identified.
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $found_type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $found_type = "rtf";
                last;
            }
            $first = 0;
        }

        # is this is a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $found_type = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path.
    close($chk);

    return $found_type;
}
516
517
518	# Specific type-to-type conversions
519	#
520	# Each of the following functions attempts to convert a document from
521	# a specific format to another. If they succeed they return 1 and leave
522	# the output document(s) in the appropriate place; if they fail they
523	# return 0 and delete any working files.
524
525
# Attempt to convert a word document to html with the wv program
#
# Arguments:
#   $input_filename  - path of the Word document
#   $output_filestem - output path without extension; the HTML is written
#                      to "$output_filestem.html" and wvWare's stderr to
#                      "$output_filestem.err" (where redirection is used)
#
# Returns 1 if an HTML file whose first line contains "DOCTYPE HTML" was
# produced, 0 otherwise. On failure the image directory created here is
# removed again; error output is echoed to STDERR and appended to the fail
# log when one is configured.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # fileparse() is used below; make sure its module is loaded.
    require File::Basename;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
        my $wv_home = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv";
        $ENV{'PATH'} = "$wv_home/bin:$ENV{'PATH'}";
        # Only append the previous value when there is one, to avoid an
        # "uninitialized" warning and a trailing empty path entry.
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib:$ENV{'LD_LIBRARY_PATH'}";
        } else {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_home/lib";
        }
        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added the following to work with replace_srcdoc_with_html.pl:
    # Make wvWare put any associated (image) files of the word doc into
    # folder docname-without-extension_files. This folder should be at
    # the same level as the html file generated from the doc.
    # wvWare will take care of proper interlinking.

    # This step is necessary for replace_srcdoc_with_html.pl which will
    # move the html and associated files into the import folder. We
    # want to ensure that the associated files won't overwrite similarly
    # named items already in import. Hence we put them in a folder first
    # (to which the html links properly) and that will allow
    # replace_srcdoc_with_html.pl to move them safely to /import.

    # To do all this, we need to use wvWare's --dir and --basename options
    # where dir is the full path to the image folder directory and
    # basename is the full path to the image folder appended to the name
    # which is to be prepended to every image file:
    # eg. if the images were to have names like sample0.jpg to sampleN.jpg,
    # then the basename is "/full/path/to/imgdir/sample".
    # In this case, basename is the full path to and name of the document.
    # HOWEVER: basename always takes full path, not relative url, so
    # the greenstone browser is unable to display the images (absolute paths
    # cause it to give an "external link" message)
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html
    # and http://rpmfind.net/linux/RPM/freshmeat/rpms/wv/wv-0.5.44-1.i386.html
    # "added --dir option to wvHtml so that pictures can be placed in
    # a separate directory"
    # "running wvWare through IMP to view word documents as html. It gets
    # invoked like this:
    # wvWare --dir=/tmp-wvWare --basename=/tmp-wvWare/img$$- $tmp_word >$tmp_output"

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if(-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # every image file name is prefixed with the document name
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            # Optional fail log; stays undef when not configured/openable.
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if $faillog;
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if $faillog;
                } # while errfile
                close($errfile);
            }
            close($faillog) if $faillog;
        }
        # Don't leave the image directory we created lying around.
        &util::rm_r($assoc_dir) if -d $assoc_dir;
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # if file has non-zero size (i.e. it has contents)
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it was still empty after the html
            # was generated (in case there were no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else { # there was an image folder (it was generated)
                # Therefore, the html file generated contains absolute links to the images
                # Replace them with relative links instead, so the folder can be moved elsewhere
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog>) {print {$faillog} $errline;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    # Don't leave the image directory we created lying around.
    &util::rm_r($assoc_dir) if -d $assoc_dir;

    return 0;
}
681
# Method to work with doc_to_html - Word docs might contain images.
# When such word docs are converted with wvWare, we make it generate a
# <filename>_files folder with the associated images, while the html file
# <filename> refers to the images using absolute paths to <filename>_files.
# This method reads in that html file and replaces all the absolute paths to
# the images in <filename>_files with the relative paths to the images from
# that folder. (I.e. with <filename>_files/<imagename.ext>).
#
# Arguments:
#   toppath        - top-level folder in which the html file resides (unused here)
#   docname        - name (without extension) of the html file (unused here)
#   html_file      - full path to the html file: /full/path/docname.html
#   assoc_dir_path - toppath/docname_files
#   assoc_dirname  - directory name of the folder with associated imgs: docname_files
#
# Returns 1 once the file has been rewritten, 0 if it could not be read or
# written (a message is printed to STDERR in that case).
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless(open($fin, '<', $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    # With $/ undefined the read operator returns the whole file at once.
    my $html_contents = do { local $/ = undef; <$fin> };
    close($fin);
    $html_contents = "" unless defined $html_contents;

    # 2. Build a pattern matching the absolute folder path as it appears in
    # the html. wvWare put %20 in place of space, so our prefix must do the
    # same; quotemeta then escapes every regex metacharacter (., -, [, ],
    # parentheses, +, Windows backslashes, ...) in one go.
    (my $url_prefix = $assoc_dir_path) =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($url_prefix);

    # 3. For every <a ...href=> / <img ...src=> whose value starts with the
    # absolute folder path: keep everything up to the value (including an
    # optional opening quote), swap the absolute path for the folder name,
    # and run the resulting link through post_process_assocfile_urls
    # (%20 -> space, backslash -> slash, % -> %25). The value is taken to
    # end at the first quote or '>', so the rest of the tag and the rest of
    # the document are left untouched.
    $html_contents =~ s{(<(?:a|img)\b[^>]*?(?:href|src)=["']?)$safe_reg_expression([^"'>]*)}
                       {
                           my ($tag_start, $link_rest) = ($1, $2);
                           &post_process_assocfile_urls($tag_start, $assoc_dirname . $link_rest, "");
                       }sge;

    # delete the original file and recreate it
    &util::rm($html_file);

    my $fout;
    unless(open($fout, '>', $html_file)) { # open it as a new file for writing
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents; buffered write errors surface at close
    print {$fout} $html_contents;
    unless(close($fout)) {
        print STDERR "gsConvert.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
767
# Utility routine to tidy one img src/href link pathname for the HTML plugin.
# $text (the link) is transformed in this order:
#   1. every %20 becomes a literal space (not an underscore, since
#      underscores mess with incremental rebuild),
#   2. every backslash becomes a forward slash,
#   3. every remaining percent sign becomes %25.
# Returns $pre, the transformed $text and $post joined together.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        tr{\\}{/};
        s/%/%25/g;
    }

    return join("", $pre, $text, $post);
}
783
# Attempt to convert a word document to html with the word2html scripting program
#
# Arguments:
#   $input_filename  - path of the document
#   $output_filestem - output path without extension; the HTML is written
#                      to "$output_filestem.html" and the converter's stderr
#                      to "$output_filestem.err" (where redirection is used)
#
# On Windows, .docx input is handed to docx2html.vbs (run through CScript);
# everything else goes to word2html. Returns 1 if the HTML file already
# exists or was produced with "html" in its first line, 0 otherwise. Error
# output is echoed to STDERR and appended to the fail log when configured.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if($input_filename =~ m/docx$/i) { # need to use full path to docx2html script,
                                           # else script launch fails when there are error msgs
            $vbScript = &util::filename_cat($vbScript, "docx2html.vbs");
            $vbScript = "CScript //Nologo \"$vbScript\""; # launch with CScript for error output in STDERR
                                                          # //Nologo flag avoids Microsoft's opening/logo msgs
        }
        else { # old doc versions. use the usual VB executable word2html for the
               # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&util::filename_cat($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $!=0;
    if (system($cmd)!=0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            # Optional fail log; stays undef when not configured/openable.
            my $faillog;
            if ($faillogfile ne "") {
                open($faillog, '>>', $faillogfile) or undef $faillog;
            }

            if (open(my $errfile, '<', "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if $faillog;
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if $faillog;
                } # while errfile
                close($errfile);
            }
            close($faillog) if $faillog;
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $line;
        if (open(my $tmp, '<', "$output_filestem.html")) {
            $line = <$tmp>;
            close($tmp);
        }
        if ($line && $line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', "$output_filestem.err")) {
                while (my $errline = <$errlog>) {print {$faillog} $errline;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
882
# Attempt to convert an RTF document to html with rtftohtml.
#
# Arguments:
#   $input_filename  - the RTF file to convert
#   $output_filestem - output path minus extension; rtftohtml is told to
#                      write "$output_filestem.html", and its stderr goes to
#                      "$output_filestem.err" where redirection is used
#
# Returns 1 if the resulting HTML has some text on the lines that follow
# the line containing "<body>", otherwise 0.  On success, a
# "${output_filestem}_ToC.html" file (if present) is merged into the HTML.
# On failure the .html and .err files are removed and the contents of the
# .err file are appended to $faillogfile (when one is set).
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$html_file\" \"$input_filename\"";

    $cmd .= " 2>\"$err_file\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?  Make sure we have some content
    # other than the header.  An unreadable file counts as "no content".
    my $was_successful = 0;
    if (-s $html_file && open(my $html_in, '<', $html_file)) {
        my $past_header = 0;
        while (my $line = <$html_in>) {
            if (!$past_header) {
                # the line holding <body> itself is not inspected
                $past_header = 1 if ($line =~ m/<body>/);
                next;
            }

            $line =~ s/<[^>]+>//g;
            if ($line =~ m/\w/) { # we found some content...
                $was_successful = 1;
                last;
            }
        }
        close($html_in);
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            my $src_file = "$output_filestem.src";
            &util::mv($html_file, $src_file);

            # Open all three files; stop at the first failure so that the
            # output file is not created when an input is unreadable.
            my ($html_src, $toc, $html_out);
            my $all_open = (open($html_src, '<', $src_file)
                            && open($toc, '<', $toc_file)
                            && open($html_out, '>', $html_file));

            if (!$all_open) {
                # give up on the ToC and put the plain HTML back
                foreach my $fh ($html_src, $toc, $html_out) {
                    close($fh) if (defined($fh) && defined(fileno($fh)));
                }
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to (but
            # not including) the first line without a word character.
            while (defined(my $line = <$html_src>)) {
                last if ($line !~ m/\w/);
                print {$html_out} $line;
            }

            # print out table of contents, making links relative
            for (1 .. 2) { my $ignored = <$toc>; } # ignore first 2 lines
            my $list_start = <$toc>;               # line 3 = "<ol>\n"
            print {$html_out} $list_start if (defined($list_start));
            while (my $line = <$toc>) {
                $line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $line;
            }
            close($toc);

            # rest of html src
            while (my $line = <$html_src>) {
                print {$html_out} $line;
            }
            close($html_src);
            close($html_out);

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Conversion failed: log what the converter said, then clean up.
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            #print {$faillog} "Error - rtf-converter - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
999
1000
# Convert a pdf file to html by running pdftohtml.pl through perl.
#
# $dirname is accepted but not used here.  The command line is assembled
# from the $pdf_* option variables, and the converted document is expected
# at "$output_filestem.html".  Returns 1 when the command succeeds and left
# a non-empty HTML file; otherwise echoes the converter's error output,
# copies it to $faillogfile (if set), cleans up and returns 0.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    # assemble the converter invocation piece by piece
    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
    my @cmd_parts = ("\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom");
    push(@cmd_parts, "-c") if ($pdf_complex);
    push(@cmd_parts, "-i") if ($pdf_ignore_images);
    push(@cmd_parts, "-a") if ($pdf_allow_images_only);
    push(@cmd_parts, "-hidden") unless ($pdf_nohidden);
    push(@cmd_parts, "\"$input_filename\" \"$output_filestem\"");

    my $cmd = $timeout ? "ulimit -t $timeout;" : "";
    $cmd .= join(" ", @cmd_parts);

    # where stderr cannot be redirected, send stdout to the .err file
    if ($ENV{'GSDLOS'} =~ m/^windows$/i && !$is_winnt_2000) {
        $cmd .= " > \"$err_file\"";
    }
    else {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        print STDERR ": $!" if ($!);
        print STDERR "\n";
    }

    # the converter ran cleanly and made something: tidy up and finish
    if ($retval == 0 && -s $html_file) {
        &util::rm($err_file) if (-e $err_file);
        &util::rm($out_file) if (-e $out_file);
        return 1;
    }

    # --- failure from here on ---
    &util::rm($out_file) if (-e $out_file);

    # print out the converter's std err, if any
    if (-s $err_file) {
        open(ERRLOG, $err_file) || die "$!";
        print STDERR "pdftohtml error log:\n";
        print STDERR $_ while (<ERRLOG>);
        close ERRLOG;
    }
    print STDERR "***********output filestem $html_file\n";
    &util::rm($html_file) if (-e $html_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open(ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
1064
# Convert a PDF or PostScript file to images by running pdfpstoimg.pl
# through perl.
#
# Arguments:
#   $dirname         - accepted but not used here
#   $input_filename  - the file to convert
#   $output_filestem - passed to pdfpstoimg.pl as the output location; also
#                      the stem of the temporary .out/.err capture files
#   $output_type     - requested image type; only the part after the last
#                      underscore is handed to pdfpstoimg.pl
#
# Returns 1 if pdfpstoimg.pl exits with status 0, otherwise 0 (after echoing
# its error output and appending it to $faillogfile, if one is set).
# Also returns 0 straight away when the ImageMagick probe below fails.
sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # same (case-insensitive) platform test as the other converters
    my $on_windows = ($ENV{'GSDLOS'} =~ m/^windows$/i) ? 1 : 0;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($on_windows && !Win32::IsWinNT())) {
        my $result = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore, so that a value of the
    # form <prefix>_<type> becomes just <type>.
    # NOTE(review): presumably values look like "pagedimg_jpg" - confirm
    # against the -convert_to values pdfpstoimg.pl accepts.
    $output_type =~ s/.*_(.*)/$1/i;

    # don't include the script's path (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly so "perl -S" can find it
    my $full_perl_path = &util::filename_cat($ENV{'PERLPATH'},"perl");
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if (!$on_windows || $is_winnt_2000) {
        $cmd .= " > \"$out_file\" 2> \"$err_file\"";
    } else {
        $cmd .= " > \"$err_file\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &util::rm($out_file) if (-e $out_file);

        # print out the converter's std err, if any
        if (-s $err_file) {
            open(my $errlog, '<', $err_file) or die "$!";
            print STDERR "pdfpstoimg error log:\n";
            while (my $line = <$errlog>) {
                print STDERR $line;
            }
            close($errlog);
        }

        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                if (open(my $errlog, '<', $err_file)) {
                    while (my $line = <$errlog>) { print {$faillog} $line; }
                    close($errlog);
                }
                close($faillog);
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1134
# Convert a PDF file to text with the pdftotext command.
#
# $dirname is accepted but not used here.  pdftotext is asked to write
# "$output_filestem.text".  Returns 1 when that file ends up non-empty and
# contains at least one word character; otherwise echoes the converter's
# error output, copies it to $faillogfile (if set), cleans up and returns 0.
sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= ($ENV{'GSDLOS'} =~ m/^windows$/i)
        ? " > \"$err_file\""
        : " > \"$out_file\" 2> \"$err_file\"";

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        open(EXTR_TEXT, $text_file) || warn "open: $!";
        binmode(EXTR_TEXT); # just in case...
        my $seen_text = 0;
        until ($seen_text) {
            my $line = <EXTR_TEXT>;
            last unless ($line);       # stop once nothing (true) is read
            $seen_text = 1 if ($line =~ m/\w/);
        }
        close EXTR_TEXT;
        unless ($seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # the converter made something usable: only the .err file is removed
    if (-s $text_file) {
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    # --- failure from here on ---
    # print out the converters std err, if any
    if (-s $err_file) {
        open(ERRLOG, $err_file) || die "$!";
        print STDERR "pdftotext error log:\n";
        print STDERR $_ while (<ERRLOG>);
        close ERRLOG;
    }

    # does this converter create a .out file?
    &util::rm($out_file) if (-e $out_file);
    &util::rm($text_file) if (-e $text_file);

    if (-e $err_file) {
        if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
            open(ERRLOG, $err_file);
            print FAILLOG $_ while (<ERRLOG>);
            close ERRLOG;
            close FAILLOG;
        }
        &util::rm($err_file);
    }
    return 0;
}
1200
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename  - the PostScript file to convert
#   $output_filestem - output path minus extension; the text is written to
#                      "$output_filestem.text"
#
# On Windows gs is not attempted.  Whenever gs is unavailable or reports a
# problem, the text is instead stripped out of the PostScript source with a
# series of regular expressions.  The resulting text is then re-wrapped so
# that lines break at the first whitespace after 72 characters.
#
# Returns 1 on success.  Returns 0 if the fallback cannot read the input,
# cannot write the output, or the input does not start with "%!", and also
# if the text cannot be re-read for wrapping.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    }
    else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quote the redirect target so paths containing spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        system($cmd);
        # Any non-zero $? is a failure: a non-zero exit code, death by a
        # signal (e.g. the ulimit being hit), or -1 when the program
        # couldn't be started (look at $! for the message in that case).
        if ($? != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        elsif (open(my $ps_out, '<', $text_file)) {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            my $first_line = <$ps_out>;
            if (defined($first_line) && $first_line =~ m/^Error: (.*)/) {
                $error = "interpreter error - \"$1\"";
            }
            close($ps_out);
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print {$faillog} "gs - $error\n";
            if ((-e $err_file) && open(my $errlog, '<', $err_file)) {
                while (my $line = <$errlog>) { print {$faillog} $line; }
                close($errlog);
            }
            close($faillog);
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$ps_in>); # see man perlport, under "System Resources"
        close($ps_in);

        # Make sure this is a ps file... (checked before the output file
        # is created, so a rejected input leaves no empty .text behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print {$faillog} "Bad postscript header: not '%!'\n";
                close($faillog);
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it.
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n\* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        my $stripped_out;
        if (!open($stripped_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print {$stripped_out} $text;
        close($stripped_out);
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &util::mv($text_file, $tmp_file);

    my ($unwrapped, $wrapped);
    if (!open($unwrapped, '<', $tmp_file)) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    if (!open($wrapped, '>', $text_file)) {
        # can't rewrite the text: keep the unwrapped version instead
        print STDERR "Couldn't open file for writing: $!\n";
        close($unwrapped);
        &util::mv($tmp_file, $text_file);
        &util::rm($err_file) if (-e $err_file);
        return 1;
    }

    while (my $line = <$unwrapped>) {
        while (length($line) > $wrap_length) {
            # Take $wrap_length characters plus the rest of the word in
            # progress, and drop the whitespace that follows it.
            if ($line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print {$wrapped} "$1\n";
            }
            else {
                # nothing could be split off; emit the remainder as it is
                # rather than looping forever on an unchanged line
                last;
            }
        }
        print {$wrapped} $line if (length($line) > 0);
    }
    close($unwrapped);
    close($wrapped);
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1370
1371
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# The text is first extracted with any_to_text, then wrapped in a minimal
# HTML document: blank lines become "<p>", every other line is preceded by
# "<br> ", and the characters &, < and > are escaped.  The intermediate
# "$output_filestem.text" file is removed afterwards.
#
# Returns 1 if "$output_filestem.html" was written; returns 0 if
# any_to_text fails or either file cannot be opened.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_in, $html_out);
    if (!open($text_in, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html_out, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close($text_in);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print {$html_out} "<html><head>\n";
    print {$html_out} "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print {$html_out} "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print {$html_out} "</head><body>\n\n";

    while (my $line = <$text_in>) {
        # escape markup characters; & must go first so that the entities
        # produced for < and > are not escaped a second time
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print {$html_out} "<p>";
        } else {
            print {$html_out} "<br> ", $line;
        }
    }
    print {$html_out} "\n</body></html>\n";

    close($html_out);
    close($text_in);

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1408
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Does nothing (returns 0) unless $use_strings is set.  Otherwise the input
# is read in binary mode; runs of characters outside \040-\176 are turned
# into newlines, strings shorter than 10 characters are dropped, and what
# remains is written to "$output_filestem.text".  Returns 1 if anything was
# written; if not, the output file is removed and 0 is returned.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    return 0 unless ($use_strings);

    print STDERR "\n** In any to text**\n\n";

    unless (open(IN, "<$input_filename")) {
        return 0;
    }
    binmode(IN);
    unless (open(OUT, ">$output_filestem.text")) {
        return 0;
    }

    my $chunks_written = 0;
    while (defined($_ = <IN>)) {
        my $chunk = $_;

        # delete anything that isn't a printable character
        $chunk =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long, repeating
        # until no short string is left
        $chunk =~ s/^.{0,9}$/\n/mg;
        while ($chunk =~ m/^.{1,9}$/m) {
            $chunk =~ s/^.{0,9}$/\n/mg;
            $chunk =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $chunk =~ s/\n+/\n/gs;
        $chunk =~ s/^\n//gs;

        # output whatever is left
        next unless ($chunk =~ m/[^\n ]/);
        print OUT $chunk;
        ++$chunks_written;
    }

    close OUT;
    close IN;

    # try to protect against binary only formats
    return 1 if ($chunks_written);

    &util::rm("$output_filestem.text");
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: