Context Navigation

source: main/trunk/greenstone2/bin/script/gsConvert.pl@ 30681

Last change on this file since 30681 was 30681, checked in by ak19, 8 years ago
3 new strings introduced by Kathy contained the :, which is used as a separator in the properties file. Although Kathy tried to escape it with a backslash, it broke GTI because GTI doesn't recognise the backslash as a separator and all kinds of weird things happened from then on, so that the Gujarati translator kept having to translate the current date rather than a real GS3 interface string. Modified the gti.pl code (to be committed) and the new strings that Kathy introduced, so that hopefully, GTI can now handle it. Property names and values will be split at the right-most separator character now (= or :) and any on the left should not be escaped.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 36.6 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999-2002 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. The sources of these are usually found
31	# in the $GSDLHOME/packages directory, and the executables should live in
32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
33	#
34	# Currently, we can convert the following formats by using external
35	# conversion utilities:
36	# Microsoft Word (versions 2,6,7 [==95?], 8[==97?], 9[==2000?]), RTF,
37	# Adobe PDF, PostScript, MS PowerPoint (95 and 97), and MS Excel (95 and 97).
38	#
39	# We can try to convert any file to text with a perl implementation of the
40	# UNIX strings command.
41	#
42	# We try to convert Postscript files to text using "gs" which is often on
43	# *nix machines. We fall back to performing weak text extraction by using
44	# regular expressions.
45
# Runs at compile time, so the perllib directory is on @INC before the
# `use` statements that follow are resolved.
BEGIN {
    my $gsdl_home = $ENV{'GSDLHOME'};
    if (!defined $gsdl_home) {
        die "GSDLHOME not set\n";
    }
    # Greenstone's own modules take priority over anything else on @INC.
    unshift(@INC, "$gsdl_home/perllib");
}
50
51	use strict;
52
53	use parsargv;
54	use util;
55	use FileUtils;
56	use Cwd;
57
# Are we running on WinNT or Win2000 (or later)?
# The eval yields undef when the Win32 module cannot be loaded; that case
# is mapped to 0 below.
my $is_winnt_2000 = eval {
    require Win32;
    Win32::IsWinNT();
};
$is_winnt_2000 = 0 unless defined $is_winnt_2000;

# Command-line switches; main() hands references to these to
# parsargv::parse().
my ($use_strings, $windows_scripting);
my ($pdf_complex, $pdf_nohidden, $pdf_zoom);
my ($pdf_ignore_images, $pdf_allow_images_only);
69
# Print the command-line help text on STDERR and terminate the program
# with exit status 1.  Never returns.
sub print_usage
{
    my @help_text = (
        "\n",
        "gsConvert.pl: Converts documents in a range of formats to html\n",
        " or text using third-party programs.\n\n",
        " usage: $0 [options] filename\n",
        " options:\n\t-type\tdoc|dot|pdf|ps|ppt|rtf|xls\t(input file type)\n",
        "\t-errlog\t<filename>\t(append err messages)\n",
        "\t-output\tauto|html|text|pagedimg_jpg|pagedimg_gif|pagedimg_png\t(output file type)\n",
        "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n",
        "\t-use_strings\tuse strings to extract text if conversion fails\n",
        "\t-windows_scripting\tuse windows VB script (if available) to convert Microsoft Word and PPT documents\n",
        "\t-pdf_complex\tuse complex output when converting PDF to HTML\n",
        "\t-pdf_nohidden\tDon't attempt to extract hidden text from PDF files\n",
        "\t-pdf_ignore_images\tdon't attempt to extract images when\n",
        "\t\tconverting PDF to HTML\n",
        "\t-pdf_allow_images_only\tallow images only (continue even if no text is present when converting to HTML)\n",
        "\t-pdf_zoom\tfactor by which to zoom PDF (only useful if\n",
        "\t\t-pdf_complex is set\n",
    );

    # One print per fragment, exactly as the text is laid out above.
    print STDERR $_ foreach @help_text;

    exit(1);
}
91
# Defaults for the options shared by the conversion routines below:
# no failure log, no CPU time limit, quiet.
my ($faillogfile, $timeout, $verbosity) = ("", 0, 0);
95
# Entry point: parse the command line, work out the input type and hand
# the file to the matching convertXXX routine.
#
# Arguments: the command-line argument list (normally @ARGV).
# Output:    prints the value returned by the converter ("html", "text",
#            "item" or "fail") followed by a newline on STDOUT.
# Exits:     via print_usage() on a bad command line; with status 1 when
#            the input file is unreadable or its type is unsupported.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # Dynamically figure out what the -type option can support, based on
    # whether -windows_scripting is in use or not
    my $default_type_re = "(doc|dot|pdf|ps|ppt|rtf|xls)";
    # Currently only have VBA for Word and PPT (but no XLS), hence no xlsx
    my $enhanced_type_re = "(docx?|dot|pdf|ps|pptx?|rtf|xls)";

    my $type_re = $default_type_re;

    # The arguments have not been parsed yet, so the switch still carries
    # its leading dash(es); the bare word is accepted too for backward
    # compatibility.
    foreach my $arg (@args) {
        if ($arg =~ m/^-*windows_scripting$/i) {
            $type_re = $enhanced_type_re;
        }
    }

    # read command-line arguments
    # NOTE(review): the errlog spec starts with '/' unlike its neighbours;
    # kept as is - confirm against parsargv's spec syntax.
    if (!parsargv::parse(\@args,
                         "type/$type_re/", \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(auto|html|text|pagedimg).*/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose,
                         'windows_scripting', \$windows_scripting,
                         'use_strings', \$use_strings,
                         'pdf_complex', \$pdf_complex,
                         'pdf_ignore_images', \$pdf_ignore_images,
                         'pdf_allow_images_only', \$pdf_allow_images_only,
                         'pdf_nohidden', \$pdf_nohidden,
                         'pdf_zoom/\d+/2', \$pdf_zoom
        ))
    {
        print_usage();
    }

    $verbosity = $verbose if defined $verbose;

    # The usage text advertises "-output auto", but the converters only
    # test for an empty type, html, text or an image format.  Treat "auto"
    # as "no preference" so that HTML is tried first and then TEXT.
    if (defined $output_type && $output_type =~ m/^auto$/i) {
        $output_type = "";
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames.  File::Basename is loaded here explicitly rather
    # than relying on another module having pulled it in.
    require File::Basename;
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = &FileUtils::filenameConcatenate($dirname, "$tailname");

    # No -type given: fall back on the filename extension (without its
    # dot).  A file without an extension leaves the type undefined so that
    # the dedicated error below is reported.
    if (!defined $input_type || $input_type eq "") {
        if (defined $suffix && $suffix ne "") {
            $input_type = lc(substr($suffix, 1));
        }
        else {
            $input_type = undef;
        }
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type =~ m/^docx?$/ || $input_type eq "dot") {
        $result = &convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = &convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = &convertPS($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^pptx?$/) {
        # anchored at both ends, like the doc test above
        $result = &convertPPT($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type =~ m/^xlsx?$/) {
        $result = &convertXLS($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome to whoever launched us
    print $result;
    print "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}
201
# Run the conversion on the script's command-line arguments.
main(@ARGV);
203
204
205
206	# Document-type conversion functions
207	#
208	# The following functions attempt to convert documents from their
209	# input type to the specified output type. If no output type was
210	# given, then they first attempt HTML, and then TEXT.
211	#
212	# Each returns the output type ("html" or "text") or "fail" if no
213	# conversion is possible.
214
# Convert a file that claims to be a Microsoft Word document.
#
# The content is sniffed with find_docfile_type() and the file is routed
# to the converter matching what it really is.
#   $input_filename  - file to convert
#   $output_filestem - output path without extension
#   $output_type     - requested output type (may be empty)
# Returns whatever the chosen converter returns.

sub convertDOC {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Many .doc files are not in fact word documents!
    my $detected = &find_docfile_type($input_filename);

    return &convertRTF($input_filename, $output_filestem, $output_type)
        if $detected eq "rtf";

    foreach my $word_flavour ("word6", "word7", "word8", "docx") {
        next unless $detected eq $word_flavour;
        return &convertWord678($input_filename, $output_filestem, $output_type);
    }

    # Not recognised: give it the generic treatment
    return &convertAnything($input_filename, $output_filestem, $output_type);
}
232
# Convert a Microsoft word 6/7/8 document
#
# When HTML is acceptable (no output type given, or one mentioning
# "html"), try the VB-scripting converter if -windows_scripting is set and
# the wvware-based one otherwise.  If that does not yield HTML, defer to
# convertAnything().
# Returns "html" or the result of convertAnything().

sub convertWord678 {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));

    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &doc_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    return &convertAnything($input_filename, $output_filestem, $output_type);
}
252
253
# Convert a Rich Text Format (RTF) file
#
# Only a specialised conversion to HTML is attempted (through VB scripting
# when -windows_scripting is set, through rtftohtml otherwise).
# Returns "html" on success and "fail" in every other case.

sub convertRTF {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $html_wanted = (!$output_type || ($output_type =~ m/html/i));

    if ($html_wanted) {
        my $converted = $windows_scripting
            ? &native_doc_to_html($input_filename, $output_filestem)
            : &rtf_to_html($input_filename, $output_filestem);
        return "html" if $converted;
    }

    # rtf is so ugly that it's not worth running strings over.
    # One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
    # (so convertAnything() is deliberately not used as a fallback here)
    return "fail";
}
280
281
# Convert an unidentified file
#
# Tries the generic HTML extraction and then the generic text extraction,
# each one only if the requested output type is empty or names it.
# Returns "html", "text" or "fail".

sub convertAnything {
    my ($input_filename, $output_filestem, $output_type) = @_;

    my $no_preference = !$output_type;

    # Attempt simple conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html" if &any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text" if &any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
306
307
308
# Convert an Adobe PDF document
#
# Depending on the requested output type, tries in turn: page images,
# HTML, plain text.
# Returns "item" (images), "html", "text" or "fail".

sub convertPDF {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Collapse the first "<char>-<char>" sequence to its last character
    $output_type =~ s/.\-(.)/$1/i;

    my $no_preference = !$output_type;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to HTML
    if ($no_preference || ($output_type =~ m/html/i)) {
        return "html"
            if &pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if ($no_preference || ($output_type =~ m/text/i)) {
        return "text"
            if &pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
343
344
# Convert an Adobe PostScript document
#
# Depending on the requested output type, tries page images and then
# plain text; there is no HTML conversion for PostScript.
# Returns "item" (images), "text" or "fail".

sub convertPS {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Collapse the first "<char>-<char>" sequence to its last character
    $output_type =~ s/.\-(.)/$1/i;

    # Attempt conversion to Image
    if ($output_type =~ m/jp?g|gif|png/i) {
        return "item"
            if &pdfps_to_img($dirname, $input_filename, $output_filestem, $output_type);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ m/text/i)) {
        return "text"
            if &ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
369
370
# Convert a Microsoft PowerPoint presentation.
#
# With -windows_scripting and an output type that is neither html nor
# text, the pptextract VB script is run on the file.  Otherwise, when
# HTML is acceptable, ppttohtml.pl is run.  If the branch taken did not
# succeed (or neither applied), generic text extraction is the last resort.
# Returns "item" (VB script succeeded, or its output directory already
# exists), "html", "text" or "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    if ($windows_scripting && ($output_type !~ m/html/i) && ($output_type !~ m/text/i)) {
        # Pick the image-format switch for pptextract; first match wins
        my $ppt_convert_type = "";
        foreach my $choice (["gif", "-g"], ["jp?g", "-j"], ["png", "-p"]) {
            my ($pattern, $switch) = @$choice;
            if ($output_type =~ m/$pattern/i) {
                $ppt_convert_type = $switch;
                last;
            }
        }

        my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin",
                                                       $ENV{'GSDLOS'}, "pptextract");
        # on windows the .vbs VBScript is launched through CScript
        if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
            $vbScript = "CScript //Nologo \"".$vbScript.".vbs\"";
        }

        # if the converting directory already exists
        if (-d $output_filestem) {
            print STDERR "**The conversion directory already exists\n";
            return "item";
        }

        my $cmd = $timeout ? "ulimit -t $timeout;" : "";
        $cmd .= "$vbScript $ppt_convert_type \"$input_filename\" \"$output_filestem\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        return "item" if (system($cmd) == 0);
        print STDERR "Powerpoint VB Scripting convert failed\n";
    }
    elsif (!$output_type || ($output_type =~ m/html/i)) {
        # Attempt conversion to HTML: formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Powerpoint 95/97 converter failed $!\n";
    }

    # Last resort: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
436
437
# Convert a Microsoft Excel spreadsheet.
#
# When HTML is acceptable, xlstohtml.pl is run on the file; if that is
# skipped or fails, generic text extraction is the fallback.
# Returns "html", "text" or "fail".
sub convertXLS {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ m/html/i)) {
        # formulate the command
        my $full_perl_path = &util::get_perl_exec();
        my $cmd = "\"$full_perl_path\" -S xlstohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
            $cmd .= " 2>\"$output_filestem.err\"";
        }

        # execute the command
        $! = 0;
        return "html" if (system($cmd) == 0);
        print STDERR "Excel 95/97 converter failed $!\n";
    }

    # Fallback: generic text extraction
    return "text" if &any_to_text($input_filename, $output_filestem);

    return "fail";
}
471
472
473
# Find the real type of a .doc file
#
# We seem to have a lot of files with a .doc extension that are .rtf
# files or Word 5 files. This function attempts to tell the difference.
#
#   $input_filename - file to inspect
# Returns "docx" (only with -windows_scripting and a .docx filename),
# "rtf", "word6", "word7", "word8", or "unknown" when nothing matches or
# the file cannot be opened.
sub find_docfile_type {
    my ($input_filename) = @_;

    if (($windows_scripting) && ($input_filename =~ m/\.docx$/)) {
        return "docx";
    }

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as a mode, and the handle is closed on every path.
    my $chk;
    if (!open($chk, "<", $input_filename)) {
        # nothing can be read, so nothing can be recognised
        return "unknown";
    }
    binmode($chk);

    my $doc_type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {

        if ($first) {
            $first = 0;
            # check to see if this is an rtf file
            if ($line =~ m/^\{\\rtf/) {
                $doc_type = "rtf";
                last;
            }
        }

        # is this a word 6/7/8 document?
        if ($line =~ m/Word\.Document\.([678])/) {
            $doc_type = "word$1";
            last;
        }
    }

    close($chk);
    return $doc_type;
}
514
515
516	# Specific type-to-type conversions
517	#
518	# Each of the following functions attempts to convert a document from
519	# a specific format to another. If they succeed they return 1 and leave
520	# the output document(s) in the appropriate place; if they fail they
521	# return 0 and delete any working files.
522
523
# Attempt to convert a word document to html with the wv program
#
# Launches wvware.pl through the perl executable reported by
# util::get_perl_exec().
#   $input_filename  - document to convert
#   $output_filestem - output path without extension
# Returns the exit code of wvware.pl (callers test it for truth), or 0
# when the script could not be started or was terminated by a signal.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # need to ensure that the path to perl is quoted (in case there's spaces in it)
    my $launch_cmd = "\"".&util::get_perl_exec()."\" -S wvware.pl \"$input_filename\" \"$output_filestem\" \"$faillogfile\" $verbosity $timeout";

    print STDERR "***** wvware launch cmd = $launch_cmd\n";

    my $raw_status = system($launch_cmd);

    # system() returns -1 when the command could not be run at all.
    # Dividing that by 256 would give a non-zero (true) value, so it is
    # handled explicitly.
    if ($raw_status == -1) {
        print STDERR "Error executing wvware.pl: $!\n";
        return 0;
    }

    # The low 7 bits hold the signal that killed the child, if any; that
    # is not a usable exit code either.
    if ($raw_status & 127) {
        return 0;
    }

    # The exit code proper lives in the high byte
    return $raw_status >> 8;
}
538
# Attempt to convert a word document to html with the word2html scripting program
#
#   $input_filename  - document to convert
#   $output_filestem - output path without extension; the result is
#                      written to "$output_filestem.html"
# Returns 1 when "$output_filestem.html" already exists or was produced
# and its first line mentions "html"; 0 otherwise.  Converter error
# output is echoed on STDERR and appended to $faillogfile when one is set.
sub native_doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    # build up the path to the doc-to-html conversion tool we're going to use
    my $vbScript = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", $ENV{'GSDLOS'});

    if ($ENV{'GSDLOS'} =~ m/^windows$/i) {
        # if windows scripting with docx input, use new VBscript to get the local Word install (if
        # any) to do the conversion, since docX can't be processed by word2html's windows_scripting

        if ($input_filename =~ m/docx$/i) {
            # need to use full path to docx2html script,
            # else script launch fails when there are error msgs
            $vbScript = &FileUtils::filenameConcatenate($vbScript, "docx2html.vbs");
            # launch with CScript for error output in STDERR;
            # //Nologo flag avoids Microsoft's opening/logo msgs
            $vbScript = "CScript //Nologo \"$vbScript\"";
            print STDERR "About to use windows scripting to process docx file $input_filename.\n";
            print STDERR " This may take some time. Please wait...\n";
        }
        else {
            # old doc versions. use the usual VB executable word2html for the
            # conversion. Doesn't need full path, since bin\windows is on PATH
            $vbScript = "word2html";
        }
    }
    else { # not windows
        $vbScript = "\"".&FileUtils::filenameConcatenate($vbScript, "word2html")."\"";
    }

    if (-e "$output_filestem.html") {
        print STDERR " The conversion file:\n";
        print STDERR " $output_filestem.html\n";
        print STDERR " ... already exists. Skipping\n";
        return 1;
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "$vbScript \"$input_filename\" \"$output_filestem.html\"";

    # redirecting STDERR
    $cmd .= " 2> \"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing $vbScript converter:$!\n";
        if (-s "$output_filestem.err") {
            my $errfile;
            if (open($errfile, "<", "$output_filestem.err")) {
                my $faillog;
                my $write_to_fail_log = 0;
                if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
                    $write_to_fail_log = 1;
                }

                while (my $line = <$errfile>) {
                    if ($line =~ m/\w/) {
                        print STDERR "$line";
                        print {$faillog} "$line" if ($write_to_fail_log);
                    }
                    next if ($line !~ m/startup error/);
                    print STDERR " (given an invalid .DOC file?)\n";
                    print {$faillog} " (given an invalid .DOC file?)\n"
                        if ($write_to_fail_log);
                }

                close($faillog) if ($write_to_fail_log);
                # the error file handle is released too, not just the log
                close($errfile);
            }
        }
        return 0; # we can try any_to_text
    }

    # Was the conversion successful?
    if (-s "$output_filestem.html") {
        my $first_line;
        my $tmp;
        if (open($tmp, "<", "$output_filestem.html")) {
            $first_line = <$tmp>;
            close($tmp);
        }
        if ($first_line && $first_line =~ m/html/i) {
            &FileUtils::removeFiles("$output_filestem.err") if -e "$output_filestem.err";
            return 1;
        }
    }

    # If here, an error of some sort occurred
    &FileUtils::removeFiles("$output_filestem.html") if -e "$output_filestem.html";
    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile)) {
            my $errlog;
            if (open($errlog, "<", "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }
    return 0;
}
639
# Attempt to convert an RTF document to html with rtftohtml
#
#   $input_filename  - RTF file to convert
#   $output_filestem - output path without extension; the result is
#                      written to "$output_filestem.html"
# Returns 1 when the generated HTML has some text after its <body> line,
# 0 otherwise (working files are removed and the converter's error output
# is appended to $faillogfile when one is set).  If the converter also
# wrote "${output_filestem}_ToC.html", that table of contents is merged
# into the HTML.
sub rtf_to_html {
    my ($input_filename, $output_filestem) = @_;

    # formulate the command
    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $cmd .= "rtftohtml";

    $cmd .= " -o \"$output_filestem.html\" \"$input_filename\"";

    $cmd .= " 2>\"$output_filestem.err\""
        if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000);

    # execute the command
    $! = 0;
    if (system($cmd) != 0)
    {
        print STDERR "Error executing rtf converter $!\n";
        # don't currently bother printing out error log...
        # keep going, in case it still created an HTML file...
    }

    # Was the conversion successful?
    my $was_successful = 0;
    if (-s "$output_filestem.html") {
        # make sure we have some content other than header; an unreadable
        # file simply counts as "no content"
        my $html;
        if (open($html, "<", "$output_filestem.html")) {
            my $past_header = 0;
            while (my $line = <$html>) {
                if (!$past_header) {
                    $past_header = 1 if ($line =~ m/<body>/);
                    next;
                }

                $line =~ s/<[^>]+>//g;
                if ($line =~ m/\w/) { # we found some content...
                    $was_successful = 1;
                    last;
                }
            }
            close($html);
        }
    }

    if ($was_successful) {
        &FileUtils::removeFiles("$output_filestem.err")
            if (-e "$output_filestem.err");

        # insert the (modified) table of contents, if it exists.
        if (-e "${output_filestem}_ToC.html") {
            &FileUtils::moveFiles("$output_filestem.html", "$output_filestem.src");

            # "open ... or" (not "||"): with "||" the test binds to the
            # filename string and a failed open goes unnoticed.
            my ($htmlsrc, $toc, $html_out);
            my $src_opened = open($htmlsrc, "<", "$output_filestem.src") ? 1 : 0;
            my $toc_opened = open($toc, "<", "${output_filestem}_ToC.html") ? 1 : 0;

            # Only create the output once both inputs are readable, so a
            # failure cannot leave a truncated .html behind.
            my $out_opened = 0;
            if ($src_opened && $toc_opened) {
                $out_opened = open($html_out, ">", "$output_filestem.html") ? 1 : 0;
            }

            if (!($src_opened && $toc_opened && $out_opened)) {
                close($htmlsrc) if $src_opened;
                close($toc) if $toc_opened;
                # put the converter's HTML back untouched
                &FileUtils::moveFiles("$output_filestem.src", "$output_filestem.html")
                    if (-e "$output_filestem.src");
                return 1;
            }

            # print out header info from src html: everything up to (not
            # including) the first line without a word character
            while (defined(my $src_line = <$htmlsrc>)) {
                last unless ($src_line =~ m/\w/);
                print {$html_out} $src_line;
            }

            # print out table of contents, making links relative
            my $skipped = <$toc>;   # ignore first 2 lines
            $skipped = <$toc>;
            my $list_start = <$toc>; # line 3 = "<ol>\n"
            print {$html_out} $list_start if defined $list_start;
            while (my $toc_line = <$toc>) {
                $toc_line =~ s@</body></html>$@@i; # only last line has this
                # make link relative
                $toc_line =~ s@href=\"[^\#]+@href=\"@i;
                print {$html_out} $toc_line;
            }
            close($toc);

            # rest of html src
            while (my $rest = <$htmlsrc>) {
                print {$html_out} $rest;
            }
            close($htmlsrc);
            close($html_out);

            &FileUtils::removeFiles("${output_filestem}_ToC.html");
            &FileUtils::removeFiles("${output_filestem}.src");
        }
        # we don't yet do anything with footnotes ($output_filestem_fn.html) :(
        return 1; # success
    }

    if (-e "$output_filestem.err") {
        my $faillog;
        if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
        {
            print {$faillog} "Error - rtftohtml - couldn't extract text\n";
            print {$faillog} " (rtf file might be too recent):\n";
            my $errlog;
            if (open($errlog, "<", "$output_filestem.err")) {
                while (my $err_line = <$errlog>) {
                    print {$faillog} $err_line;
                }
                close($errlog);
            }
            close($faillog);
        }
        &FileUtils::removeFiles("$output_filestem.err");
    }

    &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");

    return 0;
}
756
757
# Convert a pdf file to html with the pdftohtml command
#
#   $dirname         - not used by this routine
#   $input_filename  - PDF file to convert
#   $output_filestem - output path without extension; success is judged
#                      by a non-empty "$output_filestem.html"
# Returns 1 on success, 0 on failure.  On failure the converter's error
# output is echoed on STDERR, appended to $faillogfile when one is set,
# and the .out/.err/.html working files are removed.

sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdftohtml.pl -zoom $pdf_zoom";
    $cmd .= " -c" if ($pdf_complex);
    $cmd .= " -i" if ($pdf_ignore_images);
    $cmd .= " -a" if ($pdf_allow_images_only);
    $cmd .= " -hidden" unless ($pdf_nohidden);
    $cmd .= " \"$input_filename\" \"$output_filestem\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    $! = 0;

    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing pdftohtml.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";
    }

    # make sure the converter made something
    if ($retval != 0 || ! -s "$output_filestem.html")
    {
        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # The error log is read once and reused for both STDERR and the
        # failure log.
        my @err_lines = ();
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") or die "$!";
            @err_lines = <$errlog>;
            close($errlog);

            # print out the converter's std err, if any
            print STDERR "pdftohtml error log:\n";
            print STDERR $_ foreach @err_lines;
        }

        &FileUtils::removeFiles("$output_filestem.html") if (-e "$output_filestem.html");
        if (-e "$output_filestem.err") {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {
                print {$faillog} $_ foreach @err_lines;
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
821
# Convert a pdf file to various types of image with the convert command
#
#   $dirname         - not used by this routine
#   $input_filename  - PDF or PostScript file to convert
#   $output_filestem - output path without extension
#   $output_type     - image type; passed to pdfpstoimg.pl as -convert_to
#                      after the first "<char>_<char>" sequence has been
#                      collapsed to its last character
# Returns 1 when pdfpstoimg.pl exits with status 0, and 0 otherwise
# (including when the ImageMagick probe reports "program not found").

sub pdfps_to_img {
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Check that ImageMagick is installed and available on the path (except for Windows 95/98).
    # The NT test uses $is_winnt_2000, which was computed under an eval at
    # start-up, rather than calling into Win32:: directly: that call dies
    # if the module could not be loaded.
    if (!($ENV{'GSDLOS'} eq "windows" && !$is_winnt_2000)) {
        my $imagick_cmd = "\"".&util::get_perl_exec()."\" -S gs-magick.pl";
        $imagick_cmd = $imagick_cmd." --verbosity=$verbosity" if defined $verbosity;
        # only the exit status of the probe matters; its output is discarded
        my $probe_output = `$imagick_cmd identify 2>&1`;

        # Linux and Windows return different values for "program not found".
        # Linux returns -1 and Windows 256 for "program not found". But once they're
        # converted to signed values, it will be -1 for Linux and 1 for Windows.
        # Whenever we test for return values other than 0, shift by 8 and perform
        # unsigned to signed status conversion on $? to get expected range of return vals
        # Although gs-magick.pl already shifts its $? by 8, converts it to a signed value
        # and then exits on that, by the time we get here, we need to do it again
        my $status = $?;
        $status >>= 8;
        $status = (($status & 0x80) ? -(0x100 - ($status & 0xFF)) : $status);
        if (($ENV{'GSDLOS'} ne "windows" && $status == -1) || ($ENV{'GSDLOS'} eq "windows" && $status == 1)) {
            # ImageMagick is not installed, thus the convert utility is not available.
            print STDERR "*** ImageMagick is not installed, the convert utility is not available. Unable to convert PDF/PS to images. Status: $status\n";
            return 0;
        }
    }

    my $cmd = "";
    if ($timeout) {$cmd = "ulimit -t $timeout;";}
    $output_type =~ s/.\_(.)/$1/i;
    my $full_perl_path = &util::get_perl_exec();
    $cmd .= "\"$full_perl_path\" -S pdfpstoimg.pl -convert_to $output_type \"$input_filename\" \"$output_filestem\"";
    if ($ENV{'GSDLOS'} !~ m/^windows$/i || $is_winnt_2000) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    # don't include path on windows (to avoid having to play about
    # with quoting when GSDLHOME might contain spaces) but assume
    # that the PATH is set up correctly
    $! = 0;
    my $retval = system($cmd);
    if ($retval != 0)
    {
        print STDERR "Error executing pdfpstoimg.pl";
        if ($!) {print STDERR ": $!";}
        print STDERR "\n";

        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");

        # The error log is read once and reused for both STDERR and the
        # failure log.
        my @err_lines = ();
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") or die "$!";
            @err_lines = <$errlog>;
            close($errlog);

            # print out the converter's std err, if any
            print STDERR "pdfpstoimg error log:\n";
            print STDERR $_ foreach @err_lines;
        }

        if (-e "$output_filestem.err") {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {
                print {$faillog} $_ foreach @err_lines;
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
905
# Convert a PDF file to text with the pdftotext command
#
#   $dirname         - not used by this routine
#   $input_filename  - PDF file to convert
#   $output_filestem - output path without extension; the text is written
#                      to "$output_filestem.text"
# Returns 1 when the extracted text contains at least one word character,
# 0 otherwise.  On failure the converter's error output is echoed on
# STDERR and appended to $faillogfile when one is set.  The .out and .err
# working files are removed in both cases.

sub pdf_to_text {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";

    if ($ENV{'GSDLOS'} !~ m/^windows$/i) {
        $cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
    } else {
        $cmd .= " > \"$output_filestem.err\"";
    }

    if (system($cmd) != 0)
    {
        print STDERR "Error executing $cmd: $!\n";
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
    }

    # make sure there is some extracted text.
    if (-e "$output_filestem.text") {
        my $seen_text = 0;
        my $extr_text;
        if (open($extr_text, "<", "$output_filestem.text")) {
            binmode($extr_text); # just in case...
            while (my $line = <$extr_text>) {
                if ($line =~ m/\w/) {
                    $seen_text = 1;
                    last;
                }
            }
            close($extr_text);
        }
        else {
            # an unreadable file is treated like an empty one below
            warn "open: $!";
        }

        if ($seen_text == 0) { # no text was extracted
            print STDERR "Error: pdftotext found no text\n";
            &FileUtils::removeFiles("$output_filestem.text");
        }
    }

    # make sure the converter made something
    if (! -s "$output_filestem.text")
    {
        # The error log is read once and reused for both STDERR and the
        # failure log.
        my @err_lines = ();
        if (-s "$output_filestem.err") {
            open(my $errlog, "<", "$output_filestem.err") or die "$!";
            @err_lines = <$errlog>;
            close($errlog);

            # print out the converters std err, if any
            print STDERR "pdftotext error log:\n";
            print STDERR $_ foreach @err_lines;
        }

        &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
        &FileUtils::removeFiles("$output_filestem.text") if (-e "$output_filestem.text");
        if (-e "$output_filestem.err") {
            my $faillog;
            if ($faillogfile ne "" && open($faillog, ">>", $faillogfile))
            {
                print {$faillog} $_ foreach @err_lines;
                close($faillog);
            }
            &FileUtils::removeFiles("$output_filestem.err");
        }
        return 0;
    }

    &FileUtils::removeFiles("$output_filestem.err") if (-e "$output_filestem.err");
    # The stdout redirect above creates a .out file on non-windows
    # systems; clean it up on success as well.
    &FileUtils::removeFiles("$output_filestem.out") if (-e "$output_filestem.out");
    return 1;
}
971
# Convert a PostScript document to text
# note - just using "ps2ascii" isn't good enough, as it
# returns 0 for a postscript interpreter error. ps2ascii is just
# a wrapper to "gs" anyway, so we use that cmd here.
#
# Arguments:
#   $input_filename   path of the PostScript file to convert
#   $output_filestem  output path without extension; the text is written to
#                     "$output_filestem.text"
#
# gs is tried first (never on windows; under "ulimit -t $timeout" when
# $timeout is set).  If gs cannot be used or reports an error, the failure
# is appended to $faillogfile (when non-empty) and a crude regexp-based
# extraction is done in perl instead.  Whichever method produced the text,
# it is finally re-wrapped so that lines break at the first whitespace
# after 72 characters.
#
# Returns 1 on success, 0 if the fallback extraction could not read the
# input, could not write the output, or the input does not start with "%!".
# Dies if the text file cannot be reopened for wrapping.

sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";
    my $error = "";

    # avoid an uninitialized-value warning if GSDLOS is not set
    my $gsdlos = defined $ENV{'GSDLOS'} ? $ENV{'GSDLOS'} : "";

    # if we're on windows we'll fall straight through without attempting
    # to use gs
    if ($gsdlos =~ m/^windows$/i) {
        $error = "Windows does not support gs";
    } else {
        my $cmd = "";
        if ($timeout) {$cmd = "ulimit -t $timeout; ";}
        $cmd .= "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
        $cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$text_file\"";
        #$cmd .= "pstotext -output \"$output_filestem.text\" $input_filename\"";
        # quoted like the other paths so that file names with spaces work
        $cmd .= " 2> \"$err_file\"";
        $! = 0;

        # system() returns -1 if the command could not be started, otherwise
        # the full wait status ($?).  Test the whole status rather than only
        # the exit code ($? >> 8), so that a gs process ended by a signal
        # (for instance when the ulimit is exceeded) counts as a failure.
        my $status = system($cmd);

        if ($status != 0) {
            # if the program couldn't be started, $! holds the reason
            if ($!) {$error = $!;} else {$error = "couldn't run.\n";}
        }
        elsif (! -e $text_file) {
            $error = "did not create output file.\n";
        }
        else {
            # make sure the interpreter didn't get an error. It is technically
            # possible for the actual text to start with this, but....
            if (open(my $psout_fh, '<', $text_file)) {
                my $first_line = <$psout_fh>;
                if (defined $first_line && $first_line =~ m/^Error: (.*)/) {
                    $error = "interpreter error - \"$1\"";
                }
                close $psout_fh;
            }
        }
    }

    if ($error ne "") {
        print STDERR "Warning: Error executing gs: $error\n";
        &FileUtils::removeFiles($text_file) if (-e $text_file);

        if ("$faillogfile" ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
            print $faillog_fh "gs - $error\n";
            if (-e $err_file && open(my $err_fh, '<', $err_file)) {
                while (defined(my $err_line = <$err_fh>)) {
                    print $faillog_fh $err_line;
                }
                close $err_fh;
            }
            close $faillog_fh;
        }
        &FileUtils::removeFiles($err_file) if (-e $err_file);

        # Fine then. We'll just do a lousy job by ourselves...
        # Based on 5-line regexp sed script found at:
        # http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
        #
        print STDERR "Stripping text from postscript\n";
        my $in_fh;
        if (!open($in_fh, '<', $input_filename)) {
            warn "Couldn't read file: $!";
            print STDERR "errors\n";
            return 0;
        }
        # this is for whole .ps file...
        my $text = join('', <$in_fh>); # see man perlport, under "System Resources"
        close $in_fh;

        # Make sure this is a ps file...
        if ($text !~ m/^%!/) {
            print STDERR "Bad postscript header: not '%!'\n";
            if ($faillogfile ne "" && open(my $faillog_fh, '>>', $faillogfile)) {
                print $faillog_fh "Bad postscript header: not '%!'\n";
                close $faillog_fh;
            }
            return 0;
        }

        # if ps has Page data, then use it to delete all stuff before it
        # (everything up to and including the first %%Page: line).
        $text =~ s/^.*?%%Page:.*?\n//s; # treat string as single line

        # remove all leading non-data stuff
        $text =~ s/^.*?\(//s;

        # remove all newline chars for easier processing
        $text =~ s/\n//g;

        # Big assumption here - assume that if any co-ordinates are
        # given, then we are at the end of a sentence.
        $text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;

        # special characters--
        $text =~ s/\(\|\)/\(\ - \)/g; # | -> em-dash?

        # ? ps text formatting (eg italics?) ?
        $text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
        $text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
        $text =~ s/Fn\(j\)/\(\|\)/g; # j -> |
        # default - remove the rest
        $text =~ s/\ ?F.\((.+?)\)/\($1\)/g;

        # attempt to add whitespace between words...
        # this is based purely on observation, and may be completely wrong...
        $text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
        # eg I notice "b(" is sometimes NOT a space if preceded by a
        # negative number.
        $text =~ s/\)\d+ ?b\(/\) \( /g;

        # change quoted braces to brackets
        $text =~ s/([^\\])\\\(/$1\{/g;
        $text =~ s/([^\\])\\\)/$1\}/g ;

        # remove everything that is not between braces
        $text =~ s/\)([^\(\)])+?\(//sg ;

        # remove any Trailer eof stuff.
        $text =~ s/\)[^\)]*$//sg;

        ### ligatures have special characters...
        $text =~ s/\\013/ff/g;
        $text =~ s/\\014/fi/g;
        $text =~ s/\\015/fl/g;
        $text =~ s/\\016/ffi/g;
        $text =~ s/\\214/fi/g;
        $text =~ s/\\215/fl/g;
        $text =~ s/\\017/\n* /g; # asterisk?
        $text =~ s/\\023/\023/g; # e acute ('e)
        $text =~ s/\\177/\252/g; # u"
        # $text =~ s/ ?? /\344/g; # a"

        # the output file is only created once there is something to put
        # in it, so the early returns above leave no empty .text file
        my $out_fh;
        if (!open($out_fh, '>', $text_file)) {
            warn "Couldn't write file: $!";
            print STDERR "errors\n";
            return 0;
        }
        print $out_fh "$text";
        close $out_fh;
    }

    # wrap the text - use a minimum length. ie, first space after this length.
    my $wrap_length = 72;
    my $tmp_file = "$text_file.tmp";
    &FileUtils::moveFiles($text_file, $tmp_file);
    # "or" rather than "||": with "||" the die is attached to the file name
    # string and can never fire
    open(my $wrap_in, '<', $tmp_file)
        or die "Couldn't open file: $!";
    open(my $wrap_out, '>', $text_file)
        or die "Couldn't open file for writing: $!";
    while (defined(my $line = <$wrap_in>)) {
        while (length($line) > 0) {
            # Cut after the first run of non-space characters that reaches
            # past $wrap_length.  If nothing can be cut, the remainder is
            # written unchanged so the loop always terminates.
            if (length($line) > $wrap_length
                && $line =~ s/^(.{$wrap_length}\S*)\s*//) {
                print $wrap_out "$1\n";
            } else {
                print $wrap_out "$line";
                $line = "";
            }
        }
    }
    close $wrap_in;
    close $wrap_out;
    &FileUtils::removeFiles($tmp_file);

    &FileUtils::removeFiles($err_file) if (-e $err_file);
    return 1;
}
1141
1142
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename   path of the file to convert
#   $output_filestem  output path without extension; the result is written
#                     to "$output_filestem.html"
#
# The text is first extracted with any_to_text() and then wrapped in a
# minimal HTML page: blank lines become <p>, every other line is prefixed
# with <br>.  The intermediate .text file is removed afterwards.
#
# Returns 1 on success, 0 if any_to_text() fails or if the text file
# cannot be read or the html file cannot be written.

sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    # First generate a text file
    return 0 unless (&any_to_text($input_filename, $output_filestem));

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # create an HTML file from the text file
    my ($text_fh, $html_fh);
    if (!open($text_fh, '<', $text_file)) {
        print STDERR "any_to_html: couldn't read $text_file: $!\n";
        return 0;
    }
    if (!open($html_fh, '>', $html_file)) {
        print STDERR "any_to_html: couldn't write $html_file: $!\n";
        close $text_fh;
        &FileUtils::removeFiles($text_file) if (-e $text_file);
        return 0;
    }

    print $html_fh "<html><head>\n";
    print $html_fh "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
    print $html_fh "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
    print $html_fh "</head><body>\n\n";

    while (defined(my $line = <$text_fh>)) {
        # escape the characters that are special in HTML; "&" must be done
        # first so the entities produced below are not escaped again
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        if ($line =~ m/^\s*$/) {
            print $html_fh "<p>";
        } else {
            print $html_fh "<br> ", $line;
        }
    }
    print $html_fh "\n</body></html>\n";

    close $html_fh;
    close $text_fh;

    &FileUtils::removeFiles($text_file) if (-e $text_file);
    return 1;
}
1179
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
# Note - this assumes ascii charsets :( (jrm21)
#
# Arguments:
#   $input_filename   path of the file to scan
#   $output_filestem  output path without extension; the text is written to
#                     "$output_filestem.text"
#
# Does nothing and returns 0 unless $use_strings is true.  Otherwise every
# run of printable ascii characters (octal 040-176) at least 10 characters
# long is written out, one per line.
#
# Returns 1 if at least one chunk of text was written.  Returns 0 if a file
# could not be opened, the output could not be completely written, or
# nothing printable was found; in the last two cases the .text file is
# removed.

sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    if (!$use_strings) {
        return 0;
    }

    my $text_file = "$output_filestem.text";

    print STDERR "\n** In any to text**\n\n";
    # three-argument open: the file name is never interpreted as a mode
    open(my $in_fh, '<', $input_filename) or return 0;
    binmode($in_fh);
    open(my $out_fh, '>', $text_file) or return 0;

    my $output_line_count = 0;
    while (defined(my $line = <$in_fh>)) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^.{0,9}$/\n/mg;
        while ($line =~ m/^.{1,9}$/m) {
            $line =~ s/^.{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ m/[^\n ]/) {
            print $out_fh $line;
            ++$output_line_count;
        }
    }

    # buffered write errors only show up when the handle is closed
    my $written_ok = close $out_fh;
    close $in_fh;

    if ($output_line_count && $written_ok) { # try to protect against binary only formats
        return 1;
    }

    &FileUtils::removeFiles($text_file);
    return 0;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: