Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 2977

Last change on this file since 2977 was 2977, checked in by jrm21, 22 years ago
added infrastructure for calling an external powerpoint to html converter.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 25.4 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
[2032]	5	# gsConvert.pl -- convert documents to HTML or TEXT format
[1445]	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
[2755]	30	# by exploiting third-party programs. The sources of these are usually found
	31	# in the $GSDLHOME/packages directory, and the executables should live in
	32	# $GSDLHOME/bin/$GSDLOS (which is on the search path).
[1445]	33	#
[2755]	34	# Currently, we can convert Microsoft Word, RTF, Adobe PDF and PostScript
	35	# using specialised conversion utilities. We can try to convert any file to
	36	# text with a perl implementation of the UNIX strings command.
[2032]	37	#
	38	# We try to convert Postscript files to text using "gs" which is often on
[2755]	39	# *nix machines. We fall back to performing weak text extraction by using
	40	# regular expressions.
[1445]	41
	42	BEGIN {
	43	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	44	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	45	}
	46
	47	use parsargv;
	48	use util;
	49	use Cwd;
	50	use File::Basename;
	51
# Are we running on WinNT or Win2000 (or later)?
# If the Win32 module cannot be loaded the eval yields undef, and we
# settle on 0 (false). The value is always defined afterwards.
my $is_winnt_2000 = eval { require Win32; Win32::IsWinNT(); } || 0;
[1445]	55
	56	sub print_usage
	57	{
[1970]	58	print STDERR "\n";
	59	print STDERR "gsConvert.pl: Converts documents in a range of formats to html\n";
	60	print STDERR " or text using third-party programs.\n\n";
	61	print STDERR " usage: $0 [options] filename\n";
[2977]	62	print STDERR " options:\n\t-type\tdoc\|pdf\|ps\|ppt\|rtf\t(input file type)\n";
[2755]	63	print STDERR "\t-errlog\t<filename>\t(append err messages)\n";
	64	print STDERR "\t-output\thtml\|text\n";
	65	print STDERR "\t-timeout\t<max cpu seconds>\t(ulimit on unix systems)\n";
[1445]	66	exit(1);
	67	}
	68
# Settings shared between main() and the conversion subs further down.
#
# $faillogfile - file that converter error output is appended to
#                ("" means no fail log).
# $timeout     - max cpu seconds for a converter (0 means no limit).
#
# $timeout has to be declared here rather than inside main(): the
# *_to_html subs read $timeout when they build their "ulimit -t" prefix,
# and a variable private to main() is invisible to them, which silently
# disabled the -timeout option.
my $faillogfile = "";
my $timeout = 0;

# Parse the command line, work out the input type, and run the matching
# converter. Prints the converter's result ("html", "text" or "fail")
# followed by a newline on STDOUT. Exits with status 1 on usage errors,
# an unreadable input file or an unsupported input type.
sub main
{
    my (@args) = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf|ps|ppt|rtf)/', \$input_type,
                         '/errlog/.*/', \$faillogfile,
                         'output/(html|text)/', \$output_type,
                         'timeout/\d+/0', \$timeout,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one argument (the input file) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, "\\.[^\\.]+\$");
    my $output_filestem = util::filename_cat($dirname, "$tailname");

    # No -type given: fall back to the (lower-cased) filename extension.
    # $suffix includes the leading dot, or is "" when there is none.
    if (!defined $input_type || $input_type eq "") {
        $input_type = ($suffix ne "") ? lc(substr($suffix, 1)) : undef;
    }

    # Change to temporary working directory
    # NOTE(review): $input_filename and $output_filestem are not made
    # absolute first, so a relative input path stops resolving after this
    # chdir - callers presumably always pass absolute paths; confirm.
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    my $result;
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        $result = convertDOC($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "rtf") {
        $result = convertRTF($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "pdf") {
        $result = convertPDF($dirname, $input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ps") {
        $result = convertPS($input_filename, $output_filestem, $output_type);
    }
    elsif ($input_type eq "ppt") {
        $result = convertPPT($input_filename, $output_filestem, $output_type);
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # Report the outcome ("html", "text" or "fail") to whoever called us
    print $result, "\n";

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";
}

main(@ARGV);
	149
	150
	151
[2241]	152	# Document-type conversion functions
[1445]	153	#
	154	# The following functions attempt to convert documents from their
	155	# input type to the specified output type. If no output type was
	156	# given, then they first attempt HTML, and then TEXT.
	157	#
	158	# Each returns the output type ("html" or "text") or "fail" if no
	159	# conversion is possible.
	160
	161	# Convert a Microsoft word document
	162
	163	sub convertDOC {
	164	($input_filename, $output_filestem, $output_type) = @_;
	165
[1654]	166	# Many .doc files are not in fact word documents!
	167	my $realtype = &find_docfile_type($input_filename);
	168
[1734]	169	if ($realtype eq "word6" \|\| $realtype eq "word7" \|\| $realtype eq "word8") {
[1654]	170	return &convertWord678($input_filename, $output_filestem, $output_type);
	171	} elsif ($realtype eq "rtf") {
	172	return &convertRTF($input_filename, $output_filestem, $output_type);
	173	} else {
	174	return &convertAnything($input_filename, $output_filestem, $output_type);
	175	}
	176	}
	177
	178	# Convert a Microsoft word 6/7/8 document
	179
	180	sub convertWord678 {
	181	($input_filename, $output_filestem, $output_type) = @_;
	182
[1445]	183	my $success = 0;
	184
	185	# Attempt specialised conversion to HTML
	186	if (!$output_type \|\| ($output_type =~ /html/i)) {
	187	$success = &doc_to_html($input_filename, $output_filestem);
	188	if ($success) {
	189	return "html";
	190	}
	191	}
	192
[1654]	193	return &convertAnything($input_filename, $output_filestem, $output_type);
	194	}
	195
	196
	197	# Convert a Rich Text Format (RTF) file
	198
	199	sub convertRTF {
	200	($input_filename, $output_filestem, $output_type) = @_;
	201
	202	my $success = 0;
	203
	204	# Attempt specialised conversion to HTML
	205	if (!$output_type \|\| ($output_type =~ /html/i)) {
	206	$success = &rtf_to_html($input_filename, $output_filestem);
	207	if ($success) {
	208	return "html";
	209	}
	210	}
	211
[2755]	212	# rtf is so ugly that's it's not worth running strings over.
	213	# One day I'll write some quick'n'dirty regexps to try to extract text - jrm21
	214	# return &convertAnything($input_filename, $output_filestem, $output_type);
	215	return "fail";
[1654]	216	}
	217
	218
	219	# Convert an unidentified file
	220
	221	sub convertAnything {
	222	($input_filename, $output_filestem, $output_type) = @_;
	223
	224	my $success = 0;
	225
[1445]	226	# Attempt simple conversion to HTML
	227	if (!$output_type \|\| ($output_type =~ /html/i)) {
	228	$success = &any_to_html($input_filename, $output_filestem);
	229	if ($success) {
	230	return "html";
	231	}
	232	}
	233
	234	# Convert to text
	235	if (!$output_type \|\| ($output_type =~ /text/i)) {
[2241]	236	$success = &any_to_text($input_filename, $output_filestem);
[1445]	237	if ($success) {
	238	return "text";
	239	}
	240	}
	241	return "fail";
	242	}
	243
	244
[1654]	245
[1445]	246	# Convert an Adobe PDF document
	247
	248	sub convertPDF {
[2755]	249	my ($dirname, $input_filename, $output_filestem, $output_type) = @_;
[1445]	250
	251	my $success = 0;
	252
	253	# Attempt conversion to HTML
	254	if (!$output_type \|\| ($output_type =~ /html/i)) {
	255	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
	256	if ($success) {
	257	return "html";
	258	}
	259	}
	260
	261	# Attempt conversion to TEXT
	262	if (!$output_type \|\| ($output_type =~ /text/i)) {
[2117]	263	$success = &pdf_to_text($dirname, $input_filename, $output_filestem);
[1445]	264	if ($success) {
	265	return "text";
	266	}
	267	}
	268
	269	return "fail";
	270
	271	}
	272
	273
	274	# Convert an Adobe PostScript document
	275
	276	sub convertPS {
	277	($input_filename, $output_filestem, $output_type) = @_;
	278
	279	my $success = 0;
	280
	281	# Attempt conversion to TEXT
	282	if (!$output_type \|\| ($output_type =~ /text/i)) {
	283	$success = &ps_to_text($input_filename, $output_filestem);
	284	if ($success) {
	285	return "text";
	286	}
	287	}
	288
	289	return "fail";
	290
	291	}
	292
	293
# Convert a Microsoft PowerPoint document
#
# Arguments: input filename, output filestem, requested output type
#            (empty means "html first, then text").
# Returns:   "html" if the external ppttohtml.pl converter exited with
#            status 0, "text" if the any_to_text fallback worked,
#            otherwise "fail".
sub convertPPT {
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        # formulate the command ("my": $cmd used to leak into package scope)
        my $cmd = "perl -S ppttohtml.pl ";
        $cmd .= " \"$input_filename\" \"$output_filestem.html\"";
        # stderr is only redirected on non-windows or NT-class systems
        $cmd .= " 2>\"$output_filestem.err\""
            if ($ENV{'GSDLOS'} !~ /^windows$/i || $is_winnt_2000);

        # execute the command
        $! = 0;
        if (system($cmd) == 0) {
            return "html";
        }
        print STDERR "Powerpoint converter failed $!\n";
    }

    # NOTE(review): unlike the other convert* subs, this fallback runs
    # even when html was explicitly requested - kept as found.
    if (any_to_text($input_filename, $output_filestem)) {
        return "text";
    }

    return "fail";
}
	326
	327
	328
	329
	330
[1654]	331	# Find the real type of a .doc file
	332	#
[2012]	333	# We seem to have a lot of files with a .doc extension that are .rtf
[1654]	334	# files or Word 5 files. This function attempts to tell the difference.
[1445]	335
# Arguments: name of the file to inspect.
# Returns:   "rtf" if the first line starts with "{\rtf",
#            "word6"/"word7"/"word8" if any line contains
#            "Word.Document.6/7/8", otherwise "unknown" (also when the
#            file cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # three-arg open: the filename is never interpreted as a mode
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        if ($first) {
            $first = 0;
            # check to see if this is an rtf file
            if ($line =~ /^\{\\rtf/) {
                $type = "rtf";
                last;
            }
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.([678])/) {
            $type = "word$1";
            last;
        }
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
	367
	368
	369
[1734]	370	# Specific type-to-type conversions
[1445]	371	#
	372	# Each of the following functions attempts to convert a document from
[2755]	373	# a specific format to another. If they succeed they return 1 and leave
[1445]	374	# the output document(s) in the appropriate place; if they fail they
	375	# return 0 and delete any working files.
	376
	377
	378	# Attempt to convert a word document to html with the wv program
	379
	380	sub doc_to_html {
	381	($input_filename, $output_filestem) = @_;
	382
[2023]	383	my $wvWare = &util::filename_cat($ENV{'GSDLHOME'}, "bin",
	384	$ENV{'GSDLOS'}, "wvWare");
[1928]	385
[2241]	386	# don't include path on windows (to avoid having to play about
	387	# with quoting when GSDLHOME might contain spaces) but assume
	388	# that the PATH is set up correctly
	389	$wvWare = "wvWare" if ($ENV{'GSDLOS'} =~ /^windows$/i);
	390
[2512]	391	my $wv_conf = &util::filename_cat($ENV{'GSDLHOME'}, "etc",
[2574]	392	"packages", "wv", "wvHtml.xml");
[1928]	393
[2241]	394	my $cmd = "";
[1692]	395	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[2241]	396	$cmd .= "$wvWare --charset utf-8 --config \"$wv_conf\"";
	397	$cmd .= " \"$input_filename\" > \"$output_filestem.html\"";
	398
	399	# redirecting STDERR is a bad idea on windows 95/98
	400	$cmd .= " 2> \"$output_filestem.err\""
[2755]	401	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
[1654]	402
[1445]	403	# execute the command
[2755]	404	$!=0;
[2060]	405	if (system($cmd)!=0)
[1445]	406	{
[2755]	407	print STDERR "Error executing wv converter:$!\n";
	408	if (-s "$output_filestem.err") {
	409	open (ERRFILE, "<$output_filestem.err");
	410
	411	my $write_to_fail_log=0;
	412	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	413	{$write_to_fail_log=1;}
	414
	415	my $line;
	416	while ($line=<ERRFILE>) {
	417	if ($line =~ /\w/) {
	418	print STDERR "$line";
	419	print FAILLOG "$line" if ($write_to_fail_log);
	420	}
	421	if ($line !~ m/startup error/) {next;}
	422	print STDERR " (given an invalid .DOC file?)\n";
	423	print FAILLOG " (given an invalid .DOC file?)\n"
	424	if ($write_to_fail_log);
	425
	426	} # while ERRFILE
	427	close FAILLOG if ($write_to_fail_log);
	428	}
	429	print STDERR "Continuing...\n";
	430	return 0; # we can try any_to_text
[1445]	431	}
[1578]	432
[1445]	433	# Was the conversion successful?
[2241]	434
[2656]	435	if (-s "$output_filestem.html") {
[1445]	436	open(TMP, "$output_filestem.html");
	437	$line = <TMP>;
	438	close(TMP);
[1578]	439	if ($line && $line =~ /DOCTYPE HTML/) {
[2241]	440	&util::rm("$output_filestem.err") if -e "$output_filestem.err";
[1445]	441	return 1;
	442	}
	443	}
[2755]	444
	445	# If here, an error of some sort occurred
	446	&util::rm("$output_filestem.html") if -e "$output_filestem.html";
	447	if (-e "$output_filestem.err") {
	448	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile"))) {
	449	open (ERRLOG,"$output_filestem.err");
	450	while (<ERRLOG>) {print FAILLOG $_;}
	451	close FAILLOG;
	452	close ERRLOG;
	453	}
	454	&util::rm("$output_filestem.err");
	455	}
	456
[1445]	457	return 0;
	458	}
	459
	460
[1654]	461	# Attempt to convert an RTF document to html with rtftohtml
	462
	463	sub rtf_to_html {
[2241]	464	my ($input_filename, $output_filestem) = @_;
[1654]	465
	466	# formulate the command
[1692]	467	$cmd = "";
	468	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[2574]	469	$cmd .= "rtftohtml";
[1654]	470
[2574]	471	# it automatically uses $output_filestem.html
	472	$cmd .= " \"$input_filename\"";
	473
	474	$cmd .= " 2>\"$output_filestem.err\""
[2755]	475	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000);
[2574]	476
	477
[1654]	478	# execute the command
[2755]	479	$!=0;
[2060]	480	if (system($cmd)!=0)
[1654]	481	{
[2755]	482	print STDERR "Error executing rtf converter $!\n";
[2656]	483	# don't currently bother printing out error log...
	484	# keep going, in case it still created an HTML file...
[1654]	485	}
	486
	487	# Was the conversion successful?
[2755]	488	my $was_successful=0;
[2656]	489	if (-s "$output_filestem.html") {
[2755]	490	# make sure we have some content other than header
	491	open (HTML, "$output_filestem.html"); # what to do if fail?
	492	my $line;
	493	my $past_header=0;
	494	while ($line=<HTML>) {
	495
	496	if ($past_header == 0) {
	497	if ($line =~ /<body>/) {$past_header=1;}
	498	next;
	499	}
	500
	501	$line =~ s/<[^>]+>//g;
	502	if ($line =~ /\w/ && $past_header) { # we found some content...
	503	$was_successful=1;
	504	last;
	505	}
	506	}
	507	close HTML;
[1654]	508	}
[2574]	509
[2755]	510	if ($was_successful) {
	511	&util::rm("$output_filestem.err")
	512	if (-e "$output_filestem.err");
	513	# insert the (modified) table of contents, if it exists.
	514	if (-e "${output_filestem}_ToC.html") {
	515	&util::mv("$output_filestem.html","$output_filestem.src");
	516	my $open_failed=0;
	517	open HTMLSRC, "$output_filestem.src" \|\| ++$open_failed;
	518	open TOC, "${output_filestem}_ToC.html" \|\| ++$open_failed;
	519	open HTML, ">$output_filestem.html" \|\| ++$open_failed;
	520
	521	if ($open_failed) {
	522	close HTMLSRC;
	523	close TOC;
	524	close HTML;
	525	&util::mv("$output_filestem.src","$output_filestem.html");
	526	return 1;
	527	}
	528
	529	# print out header info from src html.
	530	while (($_ = <HTMLSRC>) =~ /\w/) {
	531	print HTML "$_";
	532	}
	533
	534	# print out table of contents, making links relative
	535	<TOC>; <TOC>; # ignore first 2 lines
	536	print HTML scalar(<TOC>); # line 3 = "<ol>\n"
	537	my $line;
	538	while ($line=<TOC>) {
	539	$line =~ s@</body></html>$@@ ; # only last line has this
	540	# make link relative
	541	$line =~ s@href=\"[^\#]+@href=\"@;
	542	print HTML $line;
	543	}
	544	close TOC;
	545
	546	# rest of html src
	547	while (<HTMLSRC>) {
	548	print HTML $_;
	549	}
	550	close HTMLSRC;
	551	close HTML;
	552
	553	&util::rm("${output_filestem}_ToC.html");
	554	&util::rm("${output_filestem}.src");
	555	}
	556	# we don't yet do anything with footnotes ($output_filestem_fn.html) :(
	557	return 1; # success
	558	}
	559
	560	if (-e "$output_filestem.err") {
	561	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	562	{
	563	print FAILLOG "Error - rtftohtml - couldn't extract text\n";
	564	print FAILLOG " (rtf file might be too recent):\n";
	565	open (ERRLOG, "$output_filestem.err");
	566	while (<ERRLOG>) {print FAILLOG $_;}
	567	close ERRLOG;
	568	close FAILLOG;
	569	}
	570	&util::rm("$output_filestem.err");
	571	}
	572
[2656]	573	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
	574
[1654]	575	return 0;
	576	}
	577
	578
[1445]	579	# Convert a pdf file to html with the pdftohtml command
	580
	581	sub pdf_to_html {
[2755]	582	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	583
[1692]	584	$cmd = "";
	585	if ($timeout) {$cmd = "ulimit -t $timeout;";}
[2755]	586	$cmd .= "perl -S pdftohtml.pl ";
[1928]	587	$cmd .= " \"$input_filename\" \"$output_filestem\"";
[2755]	588
	589	if ($ENV{'GSDLOS'} !~ /^windows$/i \|\| $is_winnt_2000) {
	590	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	591	} else {
	592	$cmd .= " > \"$output_filestem.err\"";
	593	}
	594
[2117]	595	$!=0;
[2241]	596
[2656]	597	my $retval=system($cmd);
	598	if ($retval!=0)
[1445]	599	{
[2755]	600	print STDERR "Error executing pdftohtml.pl";
[2117]	601	if ($!) {print STDERR ": $!";}
	602	print STDERR "\n";
[1445]	603	}
	604
[1692]	605	# make sure the converter made something
[2656]	606	if ($retval!=0 \|\| ! -s "$output_filestem.html")
[1692]	607	{
	608	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
[2656]	609	# print out the converter's std err, if any
	610	if (-s "$output_filestem.err") {
[1692]	611	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	612	print STDERR "pdftohtml error log:\n";
[1692]	613	while (<ERRLOG>) {
	614	print STDERR "$_";
	615	}
	616	close ERRLOG;
	617	}
[2656]	618	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
[2755]	619	if (-e "$output_filestem.err") {
	620	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	621	{
	622	open (ERRLOG, "$output_filestem.err");
	623	while (<ERRLOG>) {print FAILLOG $_;}
	624	close ERRLOG;
	625	close FAILLOG;
	626	}
	627	&util::rm("$output_filestem.err");
	628	}
[1692]	629	return 0;
	630	}
	631
[2656]	632	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[1445]	633	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	634	return 1;
	635	}
	636
	637	# Convert a PDF file to text with the pdftotext command
	638
	639	sub pdf_to_text {
[2755]	640	my ($dirname, $input_filename, $output_filestem) = @_;
[1445]	641
[2248]	642	my $cmd = "pdftotext \"$input_filename\" \"$output_filestem.text\"";
[2755]	643
	644	if ($ENV{'GSDLOS'} !~ /^windows$/i) {
	645	$cmd .= " > \"$output_filestem.out\" 2> \"$output_filestem.err\"";
	646	} else {
	647	$cmd .= " > \"$output_filestem.err\"";
	648	}
[1445]	649
[2060]	650	if (system($cmd)!=0)
[1445]	651	{
	652	print STDERR "Error executing $cmd: $!\n";
	653	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	654	}
	655
[2755]	656	# make sure there is some extracted text.
	657	if (-e "$output_filestem.text") {
	658	open (EXTR_TEXT, "$output_filestem.text") \|\| warn "open: $!";
	659	binmode(EXTR_TEXT); # just in case...
	660	my $line="";
	661	my $seen_text=0;
	662	while (($seen_text==0) && ($line=<EXTR_TEXT>)) {
	663	if ($line=~ /\w/) {$seen_text=1;}
	664	}
	665	close EXTR_TEXT;
	666	if ($seen_text==0) { # no text was extracted
	667	print STDERR "Error: pdftotext found no text\n";
	668	&util::rm("$output_filestem.text");
	669	}
	670	}
	671
[1692]	672	# make sure the converter made something
[2656]	673	if (! -s "$output_filestem.text")
[1692]	674	{
	675	# print out the converters std err, if any
[2656]	676	if (-s "$output_filestem.err") {
[1692]	677	open (ERRLOG, "$output_filestem.err") \|\| die "$!";
[2755]	678	print STDERR "pdftotext error log:\n";
[1692]	679	while (<ERRLOG>) {
	680	print STDERR "$_";
	681	}
	682	close ERRLOG;
	683	}
[2656]	684	# does this converter create a .out file?
	685	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	686	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	687	if (-e "$output_filestem.err") {
	688	if ($faillogfile ne "" && defined(open(FAILLOG,">>$faillogfile")))
	689	{
	690	open (ERRLOG,"$output_filestem.err");
	691	while (<ERRLOG>) {print FAILLOG $_;}
	692	close ERRLOG;
	693	close FAILLOG;
	694	}
	695	&util::rm("$output_filestem.err");
	696	}
[1692]	697	return 0;
	698	}
[1445]	699	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	700	return 1;
	701	}
	702
[2012]	703	# Convert a PostScript document to text
	704	# note - just using "ps2ascii" isn't good enough, as it
	705	# returns 0 for a postscript interpreter error. ps2ascii is just
	706	# a wrapper to "gs" anyway, so we use that cmd here.
[1445]	707
	708	sub ps_to_text {
[2241]	709	my ($input_filename, $output_filestem) = @_;
[1445]	710
[2241]	711	my $error = "";
	712
	713	# if we're on windows we'll fall straight through without attempting
	714	# to use gs
	715	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	716	$error = "Windows does not support gs";
	717
	718	} else {
	719	my $cmd = "gs -q -dNODISPLAY -dNOBIND -dWRITESYSTEMDICT -dSIMPLE -c save ";
	720	$cmd .= "-f ps2ascii.ps \"$input_filename\" -c quit > \"$output_filestem.text\"";
	721	$cmd .= " 2> $output_filestem.err";
	722	$!=0;
[2600]	723
[2241]	724	my $retcode=system($cmd);
	725	$retcode = $? >> 8; # see man perlfunc - system for this...
	726	# if system returns -1 \| 127 (couldn't start program), look at $! for message
	727
	728	if ($retcode!=0) {if ($!) {$error=$!;} else {$error="couldn't run.\n";}}
	729	elsif (! -e "$output_filestem.text") {
	730	$error="did not create output file.\n";
[2012]	731	}
[2241]	732	else
	733	{ # make sure the interpreter didn't get an error. It is technically
	734	# possible for the actual text to start with this, but....
	735	open PSOUT, "$output_filestem.text";
	736	if (<PSOUT> =~ /^Error: (.*)/) {
	737	$error="interpreter error - \"$1\"";
	738	}
	739	close PSOUT;
	740	}
[2012]	741	}
[2241]	742
[2012]	743	if ($error ne "")
[1445]	744	{
[2755]	745	print STDERR "Warning: Error executing gs: $error\n";
[1445]	746	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
[2755]	747
	748	if ("$faillogfile" ne "" && defined(open (FAILLOG, ">>$faillogfile")))
	749	{
	750	print FAILLOG "gs - $error\n";
	751	if (-e "$output_filestem.err") {
	752	open(ERRLOG, "$output_filestem.err");
	753	while (<ERRLOG>) {print FAILLOG $_;}
	754	close ERRLOG;
	755	}
	756	close FAILLOG;
	757	}
[1445]	758	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
[2012]	759
[2755]	760
[2012]	761	# Fine then. We'll just do a lousy job by ourselves...
[2031]	762	# Based on 5-line regexp sed script found at:
[2012]	763	# http://snark.ptc.spbu.ru/mail-archives/lout/brown/msg00003.html
	764	#
[2755]	765	print STDERR "Stripping text from postscript\n";
[2012]	766	my $errorcode=0;
	767	open (IN, "$input_filename")
	768	\|\| ($errorcode=1, warn "Couldn't read file: $!");
	769	open (OUT, ">$output_filestem.text")
	770	\|\| ($errorcode=1, warn "Couldn't write file: $!");
	771	if ($errorcode) {print STDERR "errors\n";return 0;}
	772
[2031]	773	my $text=""; # this is for whole .ps file...
[2755]	774	$text = join('', <IN>); # see man perlport, under "System Resources"
[2031]	775	close IN;
	776
[2447]	777	# Make sure this is a ps file...
	778	if ($text !~ /^%!/) {
[2755]	779	print STDERR "Bad postscript header: not '%!'\n";
	780	if ($faillogfile ne "" && defined(open(FAILLOG, ">>$faillogfile")))
	781	{
	782	print FAILLOG "Bad postscript header: not '%!'\n";
	783	close FAILLOG;
	784	}
[2447]	785	return 0;
	786	}
	787
[2031]	788	# if ps has Page data, then use it to delete all stuff before it.
	789	$text =~ s/^.?%%Page:.?\n//s; # treat string as single line
	790
	791	# remove all leading non-data stuff
	792	$text =~ s/^.*?\(//s;
	793
	794	# remove all newline chars for easier processing
	795	$text =~ s/\n//g;
	796
	797	# Big assumption here - assume that if any co-ordinates are
	798	# given, then we are at the end of a sentence.
	799	$text =~ s/\)-?\d+\ -?\d+/\) \(\n\)/g;
	800
	801	# special characters--
	802	$text =~ s/\(\\|\)/\(\ - \)/g; # j -> em-dash?
	803
	804	# ? ps text formatting (eg italics?) ?
	805	$text =~ s/Fn\(f\)/\(\{\)/g; # f -> {
	806	$text =~ s/Fn\(g\)/\(\}\)/g; # g -> }
	807	$text =~ s/Fn\(j\)/\(\\|\)/g; # j -> \|
	808	# default - remove the rest
	809	$text =~ s/\ ?F.\((.+?)\)/\($1\)/g;
	810
	811	# attempt to add whitespace between words...
	812	# this is based purely on observation, and may be completely wrong...
	813	$text =~ s/([^F])[defghijkuy]\(/$1 \( /g;
	814	# eg I notice "b(" is sometimes NOT a space if preceded by a
	815	# negative number.
	816	$text =~ s/\)\d+ ?b\(/\) \( /g;
	817
	818	# change quoted braces to brackets
	819	$text =~ s/([^\\])\\\(/$1\{/g;
	820	$text =~ s/([^\\])\\\)/$1\}/g ;
	821
	822	# remove everything that is not between braces
	823	$text =~ s/\)([^\(\)])+?\(//sg ;
	824
	825	# remove any Trailer eof stuff.
	826	$text =~ s/\)[^\)]*$//sg;
	827
	828	### ligatures have special characters...
	829	$text =~ s/\\013/ff/g;
	830	$text =~ s/\\014/fi/g;
	831	$text =~ s/\\015/fl/g;
	832	$text =~ s/\\016/ffi/g;
	833	$text =~ s/\\214/fi/g;
	834	$text =~ s/\\215/fl/g;
	835	$text =~ s/\\017/\n\* /g; # asterisk?
	836	$text =~ s/\\023/\023/g; # e acute ('e)
	837	$text =~ s/\\177/\252/g; # u"
	838	# $text =~ s/ ?? /\344/g; # a"
	839
	840	print OUT "$text";
	841	close OUT;
[1960]	842	}
[2600]	843	# wrap the text - use a minimum length. ie, first space after this length.
	844	my $wrap_length=72;
	845	&util::mv("$output_filestem.text", "$output_filestem.text.tmp");
	846	open INFILE, "$output_filestem.text.tmp" \|\|
	847	die "Couldn't open file: $!";
	848	open OUTFILE, ">$output_filestem.text" \|\|
	849	die "Couldn't open file for writing: $!";
	850	my $line="";
	851	while ($line=<INFILE>) {
	852	while (length($line)>0) {
	853	if (length($line)>$wrap_length) {
	854	$line =~ s/^(.{$wrap_length}[^\s])\s//;
	855	print OUTFILE "$1\n";
	856	} else {
	857	print OUTFILE "$line";
	858	$line="";
	859	}
	860	}
	861	}
	862	close INFILE;
	863	close OUTFILE;
	864	&util::rm("$output_filestem.text.tmp");
	865
[1445]	866	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	867	return 1;
	868	}
	869
	870
	871	# Convert any file to HTML with a crude perl implementation of the
	872	# UNIX strings command.
	873
	874	sub any_to_html {
	875	($input_filename, $output_filestem) = @_;
	876
	877	# First generate a text file
	878	return 0 unless (&any_to_text($input_filename, $output_filestem));
	879
	880	# create an HTML file from the text file
	881	open(TEXT, "<$output_filestem.text");
	882	open(HTML, ">$output_filestem.html");
	883
[2241]	884	print HTML "<html><head>\n";
	885	print HTML "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n";
	886	print HTML "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n";
	887	print HTML "</head><body>\n\n";
[1734]	888
[2755]	889	my $line;
	890	while ($line=<TEXT>) {
	891	$line =~ s/</</g;
	892	$line =~ s/>/>/g;
	893	if ($line =~ /^\s*$/) {
	894	print HTML "<p>";
	895	} else {
	896	print HTML "<br> ", $line;
	897	}
[1445]	898	}
[1734]	899	print HTML "\n</body></html>\n";
[1445]	900
[2241]	901	close HTML;
	902	close TEXT;
	903
[1445]	904	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	905	return 1;
	906	}
	907
	908	# Convert any file to TEXT with a crude perl implementation of the
	909	# UNIX strings command.
[2755]	910	# Note - this assumes ascii charsets :( (jrm21)
[1445]	911
	912	sub any_to_text {
	913	($input_filename, $output_filestem) = @_;
	914
[2755]	915	open(IN, "<$input_filename") \|\| return 0;
[1734]	916	binmode(IN);
[2755]	917	open(OUT, ">$output_filestem.text") \|\| return 0;
[1445]	918
	919	my ($line);
[2755]	920	my $output_line_count = 0;
[1445]	921	while (<IN>) {
	922	$line = $_;
[1734]	923
[1445]	924	# delete anything that isn't a printable character
	925	$line =~ s/[^\040-\176]+/\n/sg;
	926
	927	# delete any string less than 10 characters long
[1734]	928	$line =~ s/^.{0,9}$/\n/mg;
	929	while ($line =~ /^.{1,9}$/m) {
	930	$line =~ s/^.{0,9}$/\n/mg;
[1445]	931	$line =~ s/\n+/\n/sg;
	932	}
	933
	934	# remove extraneous whitespace
	935	$line =~ s/\n+/\n/gs;
	936	$line =~ s/^\n//gs;
[1578]	937
[1445]	938	# output whatever is left
[1578]	939	if ($line =~ /[^\n ]/) {
[1445]	940	print OUT $line;
[2755]	941	++$output_line_count;
[1445]	942	}
	943	}
[2241]	944
	945	close OUT;
	946	close IN;
	947
[2755]	948	if ($output_line_count) { # try to protect against binary only formats
	949	return 1;
	950	}
	951
	952	&util::rm("$output_filestem.text");
	953	return 0;
	954
[1445]	955	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: