Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 22446

Last change on this file since 22446 was 22429, checked in by davidb, 14 years ago
Support of using OpenOffice scripting through JODConverter.jar added. Also added in 'use strict' and then fixed up a variety of places that needed 'my' added
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 48.3 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
BEGIN {
    # The Greenstone perl modules (parsargv, util, ...) are loaded from
    # $GSDLHOME/perllib, so that directory has to be on @INC before the
    # 'use' lines below are compiled.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, $ENV{'GSDLHOME'} . "/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use Cwd;
56	use File::Basename;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded (i.e. on
# non-Windows platforms); in that case we fall back to 0.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); };
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; filled in by parsargv::parse() in main().
my $use_strings;
my $pdf_complex;
my $pdf_nohidden;
my $pdf_zoom;
my $pdf_ignore_images;
my $pdf_allow_images_only;
my $windows_scripting;
my $openoffice_scripting;
70
# Print the command-line synopsis to STDERR and terminate with exit code 1.
# The list of accepted -type values depends on whether OpenOffice
# scripting has been switched on.
sub print_usage
{
    my $type_choices = $openoffice_scripting
        ? "doc|dot|docx|odf|pdf|ps|ppt|rtf|xls"
        : "doc|dot|pdf|ps|ppt|rtf|xls";

    my @usage_lines = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\t$type_choices\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimage_jpg|pagedimage_gif|pagedimage_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows script (if available) when converting Microsoft Word and PPT via VB script\n",
        "\t-openoffice_scripting\tuse openoffice script (if available) when converting Microsoft Word and PPT via OpenOffice\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    foreach my $usage_line (@usage_lines) {
        print STDERR $usage_line;
    }
    exit(1);
}
98
# $faillogfile: file that converter error output is appended to ("" = none).
# $timeout: CPU-second limit passed to 'ulimit -t' (0 = no limit).
my ($faillogfile, $timeout) = ("", 0);
101
# Entry point: parse the command line, work out the input type, run the
# matching convertXXX routine from inside the input file's directory and
# print its result ("html", "text", "item" or "fail") on STDOUT.
#
# Takes the command-line argument list. Exits with status 1 (via
# print_usage or directly) on bad arguments, an unreadable input file or
# an unsupported type.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # Scan for -openoffice_scripting up front, as it affects the
    # permissible values for -type.
    foreach my $arg (@args) {
        if ($arg =~ m/^-openoffice_scripting$/) {
            $openoffice_scripting = 1;
            last;
        }
    }

    my $parse_type = $openoffice_scripting
        ? 'type/(doc|dot|docx|odf|pdf|ps|ppt|rtf|xls)/'
        : 'type/(doc|dot|pdf|ps|ppt|rtf|xls)/';

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         $parse_type, \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimage).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'openoffice_scripting', \$openoffice_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must remain after option parsing
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension (without its
    # leading dot). A file with no extension leaves the type undefined,
    # which is reported below.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    # NOTE(review): $input_filename and $output_filestem are used unchanged
    # after this chdir, so this presumably relies on callers passing
    # absolute paths -- confirm.
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc" || $input_type eq "dot"
           || ($openoffice_scripting
               && ($input_type eq "docx" || $input_type eq "odf"))) {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "xls") {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome on STDOUT
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
213
# Run the converter on the script's command-line arguments.
main(@ARGV);
215
216
217
218	# Document-type conversion functions
219	#
220	# The following functions attempt to convert documents from their
221	# input type to the specified output type. If no output type was
222	# given, then they first attempt HTML, and then TEXT.
223	#
224	# Each returns the output type ("html", "text", or "item" for paged
225	# image / slide output) or "fail" if no conversion is possible.
226
227	# Convert a Microsoft word document
228
# Convert a file that claims to be a Microsoft Word document.
# Returns "html", "text" or "fail".
sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # With OpenOffice scripting enabled, .doc/.docx files are handed
    # straight to OpenOffice; there is no fallback if that fails.
    if ($openoffice_scripting && ($input_filename =~ m/\.docx?$/i)) {
        return openoffice_doc_to_html($input_filename, $output_filestem)
            ? "html"
            : "fail";
    }

    # Many .doc files are not in fact word documents, so sniff the content
    my $detected = &find_docfile_type($input_filename);

    if ($detected eq "rtf") {
        return &convertRTF($input_filename, $output_filestem, $output_type);
    }

    foreach my $word_version ("word6", "word7", "word8") {
        if ($detected eq $word_version) {
            return &convertWord678($input_filename, $output_filestem, $output_type);
        }
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
253
254	# Convert a Microsoft word 6/7/8 document
255
# Convert a Microsoft Word 6/7/8 document.
# Tries the configured HTML converter first (when HTML output is
# acceptable) and otherwise falls back to convertAnything.
sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_acceptable = (!$output_type || ($output_type =~ m/html/i));
    if ($html_acceptable) {
        my $converted = 0;
        if ($windows_scripting) {
            $converted = &native_doc_to_html($input_filename, $output_filestem);
        }
        elsif ($openoffice_scripting) {
            $converted = &openoffice_doc_to_html($input_filename, $output_filestem);
        }
        else {
            $converted = &doc_to_html($input_filename, $output_filestem);
        }
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
276
277
278	# Convert a Rich Text Format (RTF) file
279
# Convert a Rich Text Format (RTF) file.
# Only HTML output is attempted; returns "html" or "fail".
sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # RTF is so ugly that it is not worth running strings over it, so
    # when HTML is not wanted (or cannot be produced) we simply fail
    # rather than falling back to convertAnything.
    unless (!$output_type || ($output_type =~ m/html/i)) {
        return "fail";
    }

    # Attempt specialised conversion to HTML
    my $converted = 0;
    if ($windows_scripting) {
        $converted = &native_doc_to_html($input_filename, $output_filestem);
    }
    elsif ($openoffice_scripting) {
        $converted = &openoffice_doc_to_html($input_filename, $output_filestem);
    }
    else {
        $converted = &rtf_to_html($input_filename, $output_filestem);
    }

    return $converted ? "html" : "fail";
}
307
308
309	# Convert an unidentified file
310
# Convert an unidentified file.
# Tries a simple HTML conversion, then plain text, each only if the
# requested output type allows it. Returns "html", "text" or "fail".
sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt simple conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
333
334
335
336	# Convert an Adobe PDF document
337
# Convert an Adobe PDF document.
#
# Depending on $output_type this attempts, in order: page images
# (jpg/gif/png), HTML, then plain text.
# Returns "item" (images), "html", "text" or "fail".
sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Avoid uninitialized-value warnings when no output type was given
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last hyphen in the output type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        $success = &pdf_to_html($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "html";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &pdf_to_text($dirname, $input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";
}
370
371
372	# Convert an Adobe PostScript document
373
# Convert an Adobe PostScript document.
#
# Depending on $output_type this attempts page images (jpg/gif/png) and
# then plain text; HTML is not attempted for PostScript.
# Returns "item" (images), "text" or "fail".
sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $success = 0;

    # Avoid uninitialized-value warnings when no output type was given
    $output_type = "" unless defined $output_type;

    # Keep only what follows the last hyphen in the output type, if any
    $output_type =~ s/.*\-(.*)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        $success = &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
        if ($success) {
            return "item";
        }
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        $success = &ps_to_text($input_filename, $output_filestem);
        if ($success) {
            return "text";
        }
    }

    return "fail";
}
396
397
# Convert a Microsoft PowerPoint document.
#
# With Windows scripting and a non-html/non-text output type the slides
# are exported through the pptextract script ("item"). Otherwise HTML is
# attempted with ppttohtml.pl. Whenever those do not return, text
# extraction via any_to_text is the last resort.
# Returns "item", "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Map the requested image format onto pptextract's flag
        my $ppt_convert_type = ($output_type =~ m/gif/i)  ? "-g"
                             : ($output_type =~ m/jp?g/i) ? "-j"
                             : ($output_type =~ m/png/i)  ? "-p"
                             :                              "";

        my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                           $ENV{'GSDLOS'}, "pptextract");
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "pptextract";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        if (system($cmd) == 0) {
            return "item";
        }
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: plain text extraction
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
459
460
# Convert a Microsoft Excel document.
# Attempts HTML through xlstohtml.pl when HTML output is acceptable and
# falls back to text extraction. Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $cmd = "perl -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fall back to plain text extraction
    return &any_to_text($input_filename, $output_filestem) ? "text" : "fail";
}
493
494
495
496	# Find the real type of a .doc file
497	#
498	# We seem to have a lot of files with a .doc extension that are .rtf
499	# files or Word 5 files. This function attempts to tell the difference.
# Determine the real type of a file carrying a .doc extension.
#
# Parameter: $input_filename - path of the file to inspect.
# Returns "rtf" if the first line starts with "{\rtf", "word6"/"word7"/
# "word8" if any line contains "Word.Document.<6|7|8>", and "unknown"
# otherwise (including when the file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and the handle cannot leak past this sub.
    my $chk;
    if (!open($chk, "<", $input_filename)) {
        return "unknown";
    }
    binmode($chk);

    my $doctype = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doctype = "rtf";
                last;
            }
            $first = 0;
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doctype = "word$1";
            last;
        }
    }

    # Single exit point so the handle is closed on every path
    close($chk);
    return $doctype;
}
531
532
533	# Specific type-to-type conversions
534	#
535	# Each of the following functions attempts to convert a document from
536	# a specific format to another. If they succeed they return 1 and leave
537	# the output document(s) in the appropriate place; if they fail they
538	# return 0 and delete any working files.
539
540
541	# Attempt to convert a word document to html with the wv program
# Attempt to convert a word document to html with the wv program (wvWare).
#
# Parameters:
#   $input_filename  - the word document to convert
#   $output_filestem - output path without extension; the html goes to
#                      "$output_filestem.html", stderr to "$output_filestem.err"
# Returns 1 on success and 0 on failure. When wvWare itself exits non-zero
# its error output is echoed to STDERR (and appended to the fail log, if
# one is configured) and 0 is returned without removing the working files.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wvWare");

    # On linux a wv tree under bin/linux/wv takes precedence, and its
    # bin and lib directories are put on the search paths.
    if ( -d "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv" && $ENV{'GSDLOS'} eq "linux" ) {
        $ENV{'PATH'} = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/bin:$ENV{'PATH'}";

        # Only append the previous value when there is one: an unset
        # LD_LIBRARY_PATH would otherwise raise a warning and leave a
        # trailing ':' (an empty entry) in the search path.
        my $wv_lib = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/wv/lib";
        if (defined $ENV{'LD_LIBRARY_PATH'} && $ENV{'LD_LIBRARY_PATH'} ne "") {
            $ENV{'LD_LIBRARY_PATH'} = "$wv_lib:$ENV{'LD_LIBRARY_PATH'}";
        }
        else {
            $ENV{'LD_LIBRARY_PATH'} = $wv_lib;
        }

        $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'}, "wv", "bin", "wvWare");
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
                                      "packages", "wv", "wvHtml.xml");

    # Added to work with replace_srcdoc_with_html.pl:
    # wvWare is made to put any associated (image) files of the word doc
    # into a folder <docname>_files next to the generated html file, so
    # that moving the html plus its files into the import folder cannot
    # overwrite similarly named items already there.
    #
    # This uses wvWare's --dir and --basename options: dir is the full
    # path to the image folder and basename is that path plus the prefix
    # given to every image file name (here: the document name).
    # HOWEVER: basename always takes a full path, not a relative url, so
    # the generated html contains absolute image links which the
    # greenstone browser refuses to display ("external link" message);
    # they are made relative again after a successful conversion.
    # See http://osdir.com/ml/lib.wvware.devel/2002-11/msg00014.html

    # toppath is the folder where html is generated
    # docname is the name (without extension) of the html to be generated
    # suffix (extension) is thrown away
    my ($docname, $toppath)
        = &File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");

    # We want the image folder generated to have the same name as windows
    # would generate ($windows_scripting) when it converts from word to html.
    # That is, foldername=docname_files
    my $assoc_dir = &util::filename_cat($toppath, $docname."_files");

    # ensure this image directory exists
    # if it exists already, just delete and recreate
    if (-e $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
    &util::mk_dir($assoc_dir);

    # prefix (including the folder path) for every extracted image file
    my $img_basenames = &util::filename_cat($assoc_dir, $docname);

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    # wvWare's --dir and --basename options for image directory.
    $cmd .= "$wvWare --dir \"$assoc_dir\" --basename \"$img_basenames\"";
    $cmd .= " --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$output_filestem.html\"";

    # redirecting STDERR is a bad idea on windows 95/98
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter:$!\n";
        if (-s "$output_filestem.err") {
            my $write_to_fail_log = 0;
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?

    if (-s "$output_filestem.html") { # file has non-zero size (i.e. it has contents)
        my $first_line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/DOCTYPE HTML/) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";

            # Remove the images directory if it is still empty after the
            # html was generated (no images in the word document)
            if (&util::is_dir_empty($assoc_dir)) {
                &util::rm_r($assoc_dir);
            } else {
                # The html contains absolute links into the image folder.
                # Replace them with relative links so the folder can be
                # moved elsewhere together with the html file.
                &make_links_to_assocdir_relative($toppath, $docname, "$output_filestem.html", $assoc_dir, $docname."_files");
            }
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }

    return 0;
}
696
697	# Method to work with doc_to_html - Word docs might contain images.
698	# When such word docs are converted with wvWare, we make it generate a
699	# <filename>_files folder with the associated images, while the html file
700	# <filename> refers to the images using absolute paths to <filename>_files.
701	# This method reads in that html file and replaces all the absolute paths to
702	# the images in <filename>_files with the relative paths to the images from
703	# that folder. (I.e. with <filename>_files/<imagename.ext>).
# Rewrite the absolute links that wvWare put into a generated html file so
# that they point at the associated-files folder by its relative name.
#
# Parameters:
#   $toppath        - folder in which the html file resides
#   $docname        - name (without extension) of the html file
#   $html_file      - full path to the html file: /full/path/docname.html
#   $assoc_dir_path - full path of the associated-files folder (toppath/docname_files)
#   $assoc_dirname  - bare name of that folder: docname_files
# Returns 1 on success, 0 if the html file could not be read or rewritten.
sub make_links_to_assocdir_relative{
    my ($toppath, $docname, $html_file, $assoc_dir_path, $assoc_dirname) = @_;

    # 1. Read all the contents of the html into a string
    my $fin;
    unless (open($fin, "<", $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for reading absolute urls...ERROR: $!\n";
        return 0;
    }
    my $html_contents;
    {
        local $/ = undef; # Read entire file at once
        $html_contents = <$fin>; # Now file is read in as one single 'line'
    }
    close($fin);
    $html_contents = "" unless defined $html_contents; # empty file

    # 2. Replace all occurrences of the assoc_dir_path in a href and img src
    # values with assoc_dirname.
    # wvWare writes %20 in place of spaces, so the prefix we search for has
    # to use %20 as well. The path is then matched literally: quotemeta
    # escapes every character that is special in a regular expression
    # (., -, [, ], (, ), +, Windows style backslashes, ...).
    my $url_prefix = $assoc_dir_path;
    $url_prefix =~ s/ /%20/g;
    my $safe_reg_expression = quotemeta($url_prefix);

    # Look for <a or <img, followed by any other attributes and values up to
    # the FIRST (non-greedy) href= or src=, an optional opening quote, then
    # the associated folder's path, then the rest of the tag up to the first
    # >. Everything is kept except the folder path, which is replaced by the
    # folder's bare name. /s lets . match newlines so tags may span lines,
    # /g replaces every occurrence.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)$safe_reg_expression(.*?(\"|\')?.*?>)/$1$assoc_dirname$5/sg;

    # Now turn %20 back into literal spaces (and normalise slashes and
    # percent signs) via post_process_assocfile_urls.
    # NOTE(review): the (.*) capture is greedy, so with /s it extends from
    # the first matching href/src to the last '>' in the file -- confirm
    # that processing that whole span is intended.
    $html_contents =~ s/(<(a|img).*?(href|src)=(\"|\')?)(.*)(.*?(\"|\')?.*?>)/&post_process_assocfile_urls($1, $5, $6)/sge;

    # delete the original file and recreate it
    &util::rm($html_file);

    my $fout;
    unless (open($fout, ">", $html_file)) {
        print STDERR "gsConvert.pl: Unable to open $html_file for writing relative links...ERROR: $!\n";
        return 0;
    }

    # write out the updated contents; buffered write errors surface at close
    print {$fout} $html_contents;
    unless (close($fout)) {
        print STDERR "gsConvert.pl: Unable to finish writing $html_file...ERROR: $!\n";
        return 0;
    }
    return 1;
}
782
783	# Utility routine to make sure HTML plugin gets img src/href link pathnames that contain
784	# url slashes (/) instead of windows-style backwards slashes, and to convert all %20
785	# introduced in link pathnames by wvWare into space again. Converts all percent signs
786	# introduced by URL encoding filenames generated into %25 in these url links referencing them
# Normalise the link text found between $pre and $post and return the
# three pieces glued back together:
#   - %20 becomes a literal space (not an underscore, since underscores
#     mess with incremental rebuild),
#   - windows-style backslashes become url slashes (/),
#   - every remaining percent sign becomes %25.
sub post_process_assocfile_urls
{
    my ($pre, $text, $post) = @_;

    for ($text) {
        s/%20/ /g;
        tr{\\}{/};
        s/%/%25/g;
    }

    return $pre . $text . $post;
}
798
799	# Attempt to convert a word document to html with the word2html scripting program
# Attempt to convert a word document to html with the word2html scripting program.
#
# Parameters:
#   $input_filename  - the document to convert
#   $output_filestem - output path without extension; the html goes to
#                      "$output_filestem.html", stderr to "$output_filestem.err"
# Returns 1 on success (or when the html file already exists) and 0 on
# failure. When the script itself exits non-zero its error output is
# echoed to STDERR (and appended to the fail log, if one is configured).
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $vbScript = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
                                       $ENV{'GSDLOS'}, "word2html");

    # On windows the bare script name is used rather than its full path
    $vbScript = "word2html" if ($ENV{'GSDLOS'} =~ m/^windows$/i);

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing word2Html converter:$!\n";
        if (-s "$output_filestem.err") {
            my $write_to_fail_log = 0;
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                $write_to_fail_log = 1;
            }

            if (open(my $errfile, "<", "$output_filestem.err")) {
                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    if ($line !~ m/startup error/) {next;}
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }
                close($errfile);
            }
            close($faillog) if ($write_to_fail_log);
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        if (open(my $tmp, "<", "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &util::rm("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        if ($faillogfile ne "" && open(my $faillog, ">>", $faillogfile)) {
            if (open(my $errlog, "<", "$output_filestem.err")) {
                while (my $line = <$errlog>) {print {$faillog} $line;}
                close($errlog);
            }
            close($faillog);
        }
        &util::rm("$output_filestem.err");
    }
    return 0;
}
877
878	# Attempt to convert a word document to html with JODConvert scripting program
sub openoffice_doc_to_html {
    # Convert a document to HTML by running the 'oo2html' script that lives
    # in $GEXT_OPENOFFICE/bin/script.
    #
    # Arguments:
    #   $input_filename  - document to convert
    #   $output_filestem - output path without extension; the converter is
    #                      asked to write "$output_filestem.html" and its
    #                      stderr is captured in "$output_filestem.err"
    #
    # Returns 1 if "$output_filestem.html" already exists or was produced
    # and its first line mentions "html"; returns 0 otherwise (the caller
    # can then fall back to another converter).
    #
    # Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    if (-e $html_file) {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... skipping\n";
        return 1;
    }

    # Without the extension's home directory there is nothing to run.
    if (!defined $ENV{'GEXT_OPENOFFICE'}) {
        print STDERR "Error: GEXT_OPENOFFICE is not set.\n";
        print STDERR " Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    my $oo_script_dir = &util::filename_cat($ENV{'GEXT_OPENOFFICE'}, "bin", "script");
    my $oo2html = &util::filename_cat($oo_script_dir, "oo2html");
    if (!-e $oo2html) {
        print STDERR "Error: Unable to find 'oo2html' in: \n";
        print STDERR " $oo_script_dir\n";
        print STDERR " Is the OpenOffice extension to Greenstone installed?\n";
        return 0;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$oo2html \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR (not done on Windows unless $is_winnt_2000 is set)
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " 2> \"$output_filestem.err\"";
    }

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing oo2html converter: $!\n";
        print STDERR "Command was: $cmd\n";

        if (-s $err_file) {
            if (open(my $errfile, '<', $err_file)) {
                my $faillog;
                my $write_to_fail_log = 0;
                if ($faillogfile ne "" && open($faillog, '>>', $faillogfile)) {
                    $write_to_fail_log = 1;
                }

                while (defined(my $line = <$errfile>)) {
                    # echo every non-blank line of the converter's stderr
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print $faillog "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print $faillog " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }

                close $faillog if ($write_to_fail_log);
                close $errfile;
            }
            else {
                print STDERR "Warning: couldn't read $err_file: $!\n";
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?  Require a non-empty file whose first
    # line mentions "html".
    if (-s $html_file) {
        my $first_line;
        if (open(my $tmp, '<', $html_file)) {
            $first_line = <$tmp>;
            close $tmp;
        }
        if ($first_line && $first_line =~ m/html/i) {
            &util::rm($err_file) if (-e $err_file);
            return 1;
        }
    }

    # If here, an error of some sort occurred: discard the output and move
    # the converter's stderr into the fail log (when one is configured).
    &util::rm($html_file) if (-e $html_file);
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            if (open(my $errlog, '<', $err_file)) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }
    return 0;
}
962
963	# Attempt to convert an RTF document to html with rtftohtml
sub rtf_to_html {
    # Convert an RTF document to HTML with the external 'rtftohtml' program.
    #
    # Arguments:
    #   $input_filename  - RTF file to convert
    #   $output_filestem - output path without extension; the result is
    #                      written to "$output_filestem.html"
    #
    # Returns 1 when the generated HTML has some word content after its
    # <body> tag, 0 otherwise.  If the converter also produced
    # "${output_filestem}_ToC.html", that table of contents is merged into
    # the main HTML file with its links made relative.
    #
    # Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";
    my $toc_file  = "${output_filestem}_ToC.html";
    my $src_file  = "$output_filestem.src";

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";
    #$cmd .= "rtf-converter";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " 2>\"$output_filestem.err\"";
    }

    # execute the command
    $! = 0;
    if (system($cmd) != 0) {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s $html_file) {
        # make sure we have some content other than header; an unreadable
        # file simply counts as a failed conversion
        if (open(my $html_in, '<', $html_file)) {
            my $past_header = 0;
            while (defined(my $line = <$html_in>)) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close $html_in;
        }
    }

    if ($was_successful) {
        &util::rm($err_file) if (-e $err_file);

        # insert the (modified) table of contents, if it exists.
        if (-e $toc_file) {
            &util::mv($html_file, $src_file);

            # 'or' (not '||') so the failure counter is tied to open()
            # itself rather than to the always-true filename string.
            my ($htmlsrc, $toc, $html_out);
            my $open_failed = 0;
            open($htmlsrc, '<', $src_file) or ++$open_failed;
            open($toc, '<', $toc_file) or ++$open_failed;
            if (!$open_failed) {
                open($html_out, '>', $html_file) or ++$open_failed;
            }

            if ($open_failed) {
                # give up on the merge and put the plain HTML back
                foreach my $fh ($htmlsrc, $toc, $html_out) {
                    close $fh if (defined $fh && defined fileno($fh));
                }
                &util::mv($src_file, $html_file);
                return 1;
            }

            # print out header info from src html: everything up to the
            # first line without a word character (that line is dropped).
            while (defined(my $header_line = <$htmlsrc>)) {
                last if ($header_line !~ m/\w/);
                print $html_out $header_line;
            }

            # print out table of contents, making links relative
            <$toc>; <$toc>; # ignore first 2 lines
            my $list_open = <$toc>; # line 3 = "<ol>\n"
            print $html_out $list_open if (defined $list_open);
            while (defined(my $toc_line = <$toc>)) {
                $toc_line =~ s@</body></html>$@@i ; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print $html_out $toc_line;
            }
            close $toc;

            # rest of html src
            while (defined(my $body_line = <$htmlsrc>)) {
                print $html_out $body_line;
            }
            close $htmlsrc;
            close $html_out;

            &util::rm($toc_file);
            &util::rm($src_file);
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    # Failure: record the converter's stderr in the fail log, then clean up.
    if (-e $err_file) {
        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "Error - rtftohtml - couldn't extract text\n";
            #print $faillog "Error - rtf-converter - couldn't extract text\n";
            print $faillog " (rtf file might be too recent):\n";
            if (open(my $errlog, '<', $err_file)) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file);
    }

    &util::rm($html_file) if (-e $html_file);

    return 0;
}
1079
1080
1081	# Convert a pdf file to html with the pdftohtml command
1082
sub pdf_to_html {
    # Convert a PDF file to HTML by running "perl -S pdftohtml.pl".
    #
    # Arguments:
    #   $dirname         - accepted for interface compatibility; not used here
    #   $input_filename  - PDF file to convert
    #   $output_filestem - output path without extension; success is judged
    #                      by a non-empty "$output_filestem.html"
    #
    # Returns 1 on success, 0 on failure.  On failure the converter's error
    # output is echoed to STDERR and appended to the fail log (if set), and
    # the temporary .out/.err/.html files are removed.
    #
    # Reads the file-level settings $timeout, $pdf_zoom, $pdf_complex,
    # $pdf_ignore_images, $pdf_allow_images_only, $pdf_nohidden,
    # $is_winnt_2000 and $faillogfile.
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "perl -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # here only stdout is redirected, and it goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s $html_file) {
        &util::rm($out_file) if (-e $out_file);

        # Read the converter's std err once; it is shown on STDERR and then
        # copied to the fail log.  Failing to read it is only worth a
        # warning - it must not abort the whole conversion script.
        my @err_lines;
        if (-s $err_file) {
            if (open(my $errlog, '<', $err_file)) {
                @err_lines = <$errlog>;
                close $errlog;
                print STDERR "pdftohtml error log:\n";
                foreach my $err_line (@err_lines) {
                    print STDERR $err_line;
                }
            } else {
                print STDERR "Warning: couldn't read $err_file: $!\n";
            }
        }

        &util::rm($html_file) if (-e $html_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                foreach my $err_line (@err_lines) {
                    print $faillog $err_line;
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1142
1143	# Convert a pdf file to various types of image with the convert command
1144
sub pdfps_to_img {
    # Convert a PDF or PostScript file to images by running
    # "perl -S pdfpstoimg.pl".
    #
    # Arguments:
    #   $dirname         - accepted for interface compatibility; not used here
    #   $input_filename  - PDF/PS file to convert
    #   $output_filestem - output path without extension
    #   $output_type     - requested output type; only the part after the
    #                      last underscore is passed to the script as the
    #                      -convert_to value
    #
    # Returns 1 when the script exits with status 0, 0 otherwise (including
    # when ImageMagick's 'identify' cannot be run).
    #
    # Reads the file-level settings $timeout, $is_winnt_2000 and $faillogfile.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    my $out_file = "$output_filestem.out";
    my $err_file = "$output_filestem.err";

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98)
    if (!($ENV{'GSDLOS'} eq "windows" && !Win32::IsWinNT())) {
        # only the exit status matters; the captured output is discarded
        my $identify_output = `identify 2>&1`;
        if ($? == -1 || $? == 256) { # Linux and Windows return different values for "program not found"
            #ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}

    # Keep only what follows the last underscore.  The quantifiers matter:
    # without them a single character either side of the underscore is
    # consumed and the rest of the prefix is left in place.
    $output_type =~ s/.*\_(.*)/$1/i;

    $cmd .= "perl -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # here only stdout is redirected, and it goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0) {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &util::rm($out_file) if (-e $out_file);

        # Read the converter's std err once; it is shown on STDERR and then
        # copied to the fail log.  Failing to read it is only worth a
        # warning - it must not abort the whole conversion script.
        my @err_lines;
        if (-s $err_file) {
            if (open(my $errlog, '<', $err_file)) {
                @err_lines = <$errlog>;
                close $errlog;
                print STDERR "pdfpstoimg error log:\n";
                foreach my $err_line (@err_lines) {
                    print STDERR $err_line;
                }
            } else {
                print STDERR "Warning: couldn't read $err_file: $!\n";
            }
        }

        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                foreach my $err_line (@err_lines) {
                    print $faillog $err_line;
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    &util::rm($out_file) if (-e $out_file);
    return 1;
}
1211
1212	# Convert a PDF file to text with the pdftotext command
1213
sub pdf_to_text {
    # Convert a PDF file to plain text with the external 'pdftotext' command.
    #
    # Arguments:
    #   $dirname         - accepted for interface compatibility; not used here
    #   $input_filename  - PDF file to convert
    #   $output_filestem - output path without extension; text is written to
    #                      "$output_filestem.text"
    #
    # Returns 1 when a text file containing at least one word character was
    # produced, 0 otherwise.  On failure the converter's error output is
    # echoed to STDERR and appended to the fail log ($faillogfile), if set.
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $out_file  = "$output_filestem.out";
    my $err_file  = "$output_filestem.err";

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        # here only stdout is redirected, and it goes to the .err file
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0) {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    # make sure there is some extracted text.
    if (-e $text_file) {
        my $seen_text = 0;
        if (open(my $extr_text, '<', $text_file)) {
            binmode($extr_text); # just in case...
            # 'defined' so that a line consisting of "0" does not end the scan
            while (!$seen_text && defined(my $line = <$extr_text>)) {
                $seen_text = 1 if ($line =~ m/\w/);
            }
            close $extr_text;
        } else {
            warn "open: $!";
        }
        if (!$seen_text) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &util::rm($text_file);
        }
    }

    # make sure the converter made something
    if (! -s $text_file) {
        # Read the converter's std err once; it is shown on STDERR and then
        # copied to the fail log.  Failing to read it is only worth a
        # warning - it must not abort the whole conversion script.
        my @err_lines;
        if (-s $err_file) {
            if (open(my $errlog, '<', $err_file)) {
                @err_lines = <$errlog>;
                close $errlog;
                print STDERR "pdftotext error log:\n";
                foreach my $err_line (@err_lines) {
                    print STDERR $err_line;
                }
            } else {
                print STDERR "Warning: couldn't read $err_file: $!\n";
            }
        }

        # does this converter create a .out file?
        &util::rm($out_file) if (-e $out_file);
        &util::rm($text_file) if (-e $text_file);
        if (-e $err_file) {
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                foreach my $err_line (@err_lines) {
                    print $faillog $err_line;
                }
                close $faillog;
            }
            &util::rm($err_file);
        }
        return 0;
    }

    &util::rm($err_file) if (-e $err_file);
    return 1;
}
1277
1278	# Convert a PostScript document to text
1279	# note - just using "ps2ascii" isn't good enough, as it
1280	# returns 0 for a postscript interpreter error. ps2ascii is just
1281	# a wrapper to "gs" anyway, so we use that cmd here.
1282
sub ps_to_text {
    # Convert a PostScript document to wrapped plain text.
    #
    # First tries Ghostscript ("gs ... -f ps2ascii.ps"); on Windows, or when
    # gs fails or reports an interpreter error, falls back to a crude
    # regexp-based extraction of the strings in the PostScript source.  The
    # resulting text is then re-wrapped and left in "$output_filestem.text".
    #
    # Arguments:
    #   $input_filename  - PostScript file to convert
    #   $output_filestem - output path without extension
    #
    # Returns 1 on success, 0 if the input cannot be read, does not start
    # with '%!', or the text file cannot be written or re-read.
    #
    # Reads the file-level settings $timeout and $faillogfile.
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $tmp_file  = "$output_filestem.text.tmp";

    my $error = "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like every other path, so stems containing spaces work
        $cmd .= " 2> \"$output_filestem.err\"";
        $! = 0;

        # Any non-zero wait status is a failure: this covers a non-zero exit
        # code, death by signal and failure to start the command at all.
        my $status = system($cmd);

        if ($status != 0) {
            $error = $! ? "$!" : "couldn't run.\n";
        } elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        } else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout, '<', $text_file)) {
                my $first_line = <$psout>;
                close $psout;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &util::rm($text_file) if (-e $text_file);

        if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
            print $faillog "gs - $error\n";
            if (-e $err_file && open(my $errlog, '<', $err_file)) {
                while (defined(my $err_line = <$errlog>)) {
                    print $faillog $err_line;
                }
                close $errlog;
            }
            close $faillog;
        }
        &util::rm($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        print STDERR "Stripping text from postscript\n";

        my $ps_in;
        if (!open($ps_in, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # whole .ps file in one string; see man perlport, under "System Resources"
        my $text = join('', <$ps_in>);
        close $ps_in;

        # Make sure this is a ps file...  (checked before the output file is
        # created, so a rejected input leaves no empty .text file behind)
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog, '>>', $faillogfile)) {
                print $faillog "Bad postscript header: not '%!'\n";
                close $faillog;
            }
            return 0;
        }

        $text = _ps_strip_text($text);

        my $text_out;
        if (!open($text_out, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print $text_out $text;
        close $text_out;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    &util::mv($text_file, $tmp_file);

    # 'or' rather than '||' so that a failed open is actually noticed.
    my $wrap_in;
    if (!open($wrap_in, '<', $tmp_file)) {
        print STDERR "Couldn't open file: $!\n";
        return 0;
    }
    my $wrap_out;
    if (!open($wrap_out, '>', $text_file)) {
        print STDERR "Couldn't open file for writing: $!\n";
        close $wrap_in;
        &util::rm($tmp_file);
        return 0;
    }

    while (defined(my $line = <$wrap_in>)) {
        while (length($line) > 0) {
            # Break after the first run of non-space characters that ends at
            # or beyond $wrap_length; if nothing matches, emit the remainder
            # unchanged so the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}[^\s]*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                print $wrap_out $line;
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &util::rm($tmp_file);

    &util::rm($err_file) if (-e $err_file);
    return 1;
}

# Crude extraction of the text strings from raw PostScript source, for use
# when Ghostscript is unavailable.  Takes the whole file as one string and
# returns the extracted text.
# Based on 5-line regexp sed script found at:
# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
sub _ps_strip_text {
    my ($text) = @_;

    # if ps has Page data, then use it to delete all stuff before it.
    $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

    # remove all leading non-data stuff
    $text =~ s/^.*?\(//s;

    # remove all newline chars for easier processing
    $text =~ s/\n//g;

    # Big assumption here - assume that if any co-ordinates are
    # given, then we are at the end of a sentence.
    $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

    # special characters--
    $text =~ s/\(\|\)/\(\ - \)/g; # j -> em-dash?

    # ? ps text formatting (eg italics?) ?
    $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
    $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
    $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
    # default - remove the rest
    $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

    # attempt to add whitespace between words...
    # this is based purely on observation, and may be completely wrong...
    $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
    # eg I notice "b(" is sometimes NOT a space if preceded by a
    # negative number.
    $text =~ s/\)\d+ ?b\(/\) \( /g;

    # change quoted braces to brackets
    $text =~ s/([^\\])\\\(/$1\{/g;
    $text =~ s/([^\\])\\\)/$1\}/g ;

    # remove everything that is not between braces
    $text =~ s/\)([^\(\)])+?\(//sg ;

    # remove any Trailer eof stuff.
    $text =~ s/\)[^\)]*$//sg;

    ### ligatures have special characters...
    $text =~ s/\\013/ff/g;
    $text =~ s/\\014/fi/g;
    $text =~ s/\\015/fl/g;
    $text =~ s/\\016/ffi/g;
    $text =~ s/\\214/fi/g;
    $text =~ s/\\215/fl/g;
    $text =~ s/\\017/\n\* /g; # asterisk?
    # NOTE(review): the replacement below is the control character \023,
    # which does not look like an e-acute - confirm the intended code point.
    $text =~ s/\\023/\023/g; # e acute ('e)
    $text =~ s/\\177/\252/g; # u"
    # $text =~ s/ ?? /\344/g; # a"

    return $text;
}
1447
1448
1449	# Convert any file to HTML with a crude perl implementation of the
1450	# UNIX strings command.
1451
sub any_to_html {
    # Convert any file to HTML: extract its printable strings with
    # any_to_text, then wrap that text in a minimal HTML page.
    #
    # Arguments:
    #   $input_filename  - file to convert
    #   $output_filestem - output path without extension; the page is
    #                      written to "$output_filestem.html"
    #
    # Returns 1 on success, 0 if any_to_text fails or either the
    # intermediate text file or the HTML file cannot be opened.  The
    # intermediate "$output_filestem.text" is removed afterwards.
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my $text_in;
    if (!open($text_in, '<', $text_file)) {
        print STDERR "Error: couldn't read $text_file: $!\n";
        return 0;
    }
    my $html_out;
    if (!open($html_out, '>', $html_file)) {
        print STDERR "Error: couldn't write $html_file: $!\n";
        close $text_in;
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    print $html_out "<html><head>\n";
    print $html_out "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_out "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_out "</head><body>\n\n";

    while (defined(my $line = <$text_in>)) {
        # Escape HTML metacharacters; '&' goes first so the entities
        # produced for '<' and '>' are not themselves re-escaped.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;

        # blank lines become paragraph breaks, everything else a line break
        if ($line =~ m/^\s*$/) {
            print $html_out "<p>";
        } else {
            print $html_out "<br> ", $line;
        }
    }
    print $html_out "\n</body></html>\n";

    close $html_out;
    close $text_in;

    &util::rm($text_file) if (-e $text_file);
    return 1;
}
1485
1486	# Convert any file to TEXT with a crude perl implementation of the
1487	# UNIX strings command.
1488	# Note - this assumes ascii charsets :( (jrm21)
1489
sub any_to_text {
    # Convert any file to text with a crude perl implementation of the UNIX
    # strings command: runs of printable ASCII that are at least 10
    # characters long are kept, everything else is dropped.
    # Note - this assumes ascii charsets :( (jrm21)
    #
    # Arguments:
    #   $input_filename  - file to scan (read in binary mode)
    #   $output_filestem - output path without extension; the text is
    #                      written to "$output_filestem.text"
    #
    # Returns 1 if at least one chunk of text was written.  Returns 0 if
    # the file-level $use_strings setting is false, if either file cannot be
    # opened, or if nothing printable was found (the empty output file is
    # then removed).
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    print STDERR "\n** In any to text**\n\n";

    my $text_file = "$output_filestem.text";

    my $in;
    open($in, '<', $input_filename) or return 0;
    binmode($in);

    my $out;
    if (!open($out, '>', $text_file)) {
        close $in; # don't leak the input handle
        return 0;
    }

    my $output_line_count = 0;
    while (defined(my $line = <$in>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out $line;
            ++$output_line_count;
        }
    }

    close $out;
    close $in;

    if ($output_line_count) { # try to protect against binary only formats
        return 1;
    }

    &util::rm($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: