Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 1654

Last change on this file since 1654 was 1654, checked in by paynter, 24 years ago
Check .doc files to see if they are RTF files, Word 6/7/8 files that wv handles, or "unknown" files (which we strip of binary characters and hope the result is worthwhile).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.8 KB

Rev	Line
[1445]	1	#!/usr/bin/perl -w
	2
	3	###########################################################################
	4	#
	5	# gsConvert.pl -- convert documents to HTML or TEXT format
	6	#
	7	# A component of the Greenstone digital library software
	8	# from the New Zealand Digital Library Project at the
	9	# University of Waikato, New Zealand.
	10	#
	11	# Copyright (C) 1999 New Zealand Digital Library Project
	12	#
	13	# This program is free software; you can redistribute it and/or modify
	14	# it under the terms of the GNU General Public License as published by
	15	# the Free Software Foundation; either version 2 of the License, or
	16	# (at your option) any later version.
	17	#
	18	# This program is distributed in the hope that it will be useful,
	19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	21	# GNU General Public License for more details.
	22	#
	23	# You should have received a copy of the GNU General Public License
	24	# along with this program; if not, write to the Free Software
	25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	26	#
	27	###########################################################################
	28
	29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
	30	# by exploiting third-party programs. These are usually found in the
	31	# $GSDLHOME/packages directory.
	32	#
	33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
	34	# conversion utilities. We can convert any file to text with a perl
	35	# implementation of the UNIX strings command.
	36
	37
	38	BEGIN {
	39	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	40	unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
	41	}
	42
	43	use parsargv;
	44	use util;
	45	use Cwd;
	46	use File::Basename;
	47
	48
	49	sub print_usage
	50	{
	51	print STDERR "Usage: $0 [-type doc\|pdf] [-output html\|text] filename\n";
	52	exit(1);
	53	}
	54
	55
	56	sub main
	57	{
	58	my (@ARGV) = @_;
	59	my ($input_type,$output_type,$verbose);
	60
	61	# read command-line arguments
	62	if (!parsargv::parse(\@ARGV,
	63	'type/(doc\|pdf)/', \$input_type,
	64	'output/(html\|text)/', \$output_type,
	65	'verbose/\d+/0', \$verbose))
	66	{
	67	print_usage();
	68	}
	69
	70	# Make sure the input file exists and can be opened for reading
	71	if (scalar(@ARGV!=1)) {
	72	print_usage();
	73	}
	74	my $input_filename = $ARGV[0];
	75	if (!-r $input_filename) {
	76	print STDERR "Error: unable to open $input_filename for reading\n";
	77	exit(1);
	78	}
	79
	80	# Deduce filenames
	81	my ($tailname,$dirname,$suffix)
	82	= File::Basename::fileparse($input_filename,'\..+');
	83	my $output_filestem = &util::filename_cat($dirname,"$tailname");
	84
	85	if ($input_type eq "")
	86	{
	87	$input_type = substr($suffix,1,length($suffix)-1);
	88	}
	89
	90	# Change to temporary working directory
	91	my $stored_dir = cwd();
	92	chdir ($dirname) \|\| die "Unable to change to directory $dirname";
	93
	94	# Select convert utility
	95	if (!defined $input_type) {
	96	print STDERR "Error: No filename extension or input type defined\n";
	97	exit(1);
	98	}
	99	elsif ($input_type eq "doc") {
	100	print &convertDOC($input_filename, $output_filestem, $output_type);
	101	print "\n";
	102	}
	103	elsif ($input_type eq "pdf") {
	104	print &convertPDF($dirname, $input_filename, $output_filestem, $output_type);
	105	print "\n";
	106	}
	107	elsif ($input_type eq "ps") {
	108	print &convertPS($input_filename, $output_filestem, $output_type);
	109	print "\n";
	110	}
	111	else {
	112	print STDERR "Error: Unable to convert type '$input_type'\n";
	113	exit(1);
	114	}
	115
	116	# restore to original working directory
	117	chdir ($stored_dir) \|\| die "Unable to return to directory $stored_dir";
	118
	119	}
	120
	121	&main(@ARGV);
	122
	123
	124
	125	# Document-type conversion functions
	126	#
	127	# The following functions attempt to convert documents from their
	128	# input type to the specified output type. If no output type was
	129	# given, then they first attempt HTML, and then TEXT.
	130	#
	131	# Each returns the output type ("html" or "text") or "fail" if no
	132	# conversion is possible.
	133
	134	# Convert a Microsoft word document
	135
	136	sub convertDOC {
	137	($input_filename, $output_filestem, $output_type) = @_;
	138
[1654]	139	# Many .doc files are not in fact word documents!
	140	my $realtype = &find_docfile_type($input_filename);
	141
	142	if ($realtype eq "word678") {
	143	return &convertWord678($input_filename, $output_filestem, $output_type);
	144	} elsif ($realtype eq "rtf") {
	145	return &convertRTF($input_filename, $output_filestem, $output_type);
	146	} else {
	147	return &convertAnything($input_filename, $output_filestem, $output_type);
	148	}
	149	}
	150
	151	# Convert a Microsoft word 6/7/8 document
	152
	153	sub convertWord678 {
	154	($input_filename, $output_filestem, $output_type) = @_;
	155
[1445]	156	my $success = 0;
	157
	158	# Attempt specialised conversion to HTML
	159	if (!$output_type \|\| ($output_type =~ /html/i)) {
	160	$success = &doc_to_html($input_filename, $output_filestem);
	161	if ($success) {
	162	return "html";
	163	}
	164	}
	165
[1654]	166	return &convertAnything($input_filename, $output_filestem, $output_type);
	167	}
	168
	169
	170	# Convert a Rich Text Format (RTF) file
	171
	172	sub convertRTF {
	173	($input_filename, $output_filestem, $output_type) = @_;
	174
	175	my $success = 0;
	176
	177	# Attempt specialised conversion to HTML
	178	if (!$output_type \|\| ($output_type =~ /html/i)) {
	179	$success = &rtf_to_html($input_filename, $output_filestem);
	180	if ($success) {
	181	return "html";
	182	}
	183	}
	184
	185	return &convertAnything($input_filename, $output_filestem, $output_type);
	186	}
	187
	188
	189	# Convert an unidentified file
	190
	191	sub convertAnything {
	192	($input_filename, $output_filestem, $output_type) = @_;
	193
	194	my $success = 0;
	195
[1445]	196	# Attempt simple conversion to HTML
	197	if (!$output_type \|\| ($output_type =~ /html/i)) {
	198	$success = &any_to_html($input_filename, $output_filestem);
	199	if ($success) {
	200	return "html";
	201	}
	202	}
	203
	204	# Convert to text
	205	if (!$output_type \|\| ($output_type =~ /text/i)) {
	206	$success = any_to_text($input_filename, $output_filestem);
	207	if ($success) {
	208	return "text";
	209	}
	210	}
	211	return "fail";
	212	}
	213
	214
[1654]	215
[1445]	216	# Convert an Adobe PDF document
	217
	218	sub convertPDF {
	219	($dirname, $input_filename, $output_filestem, $output_type) = @_;
	220
	221	my $success = 0;
	222
	223	# Attempt conversion to HTML
	224	if (!$output_type \|\| ($output_type =~ /html/i)) {
	225	$success = &pdf_to_html($dirname, $input_filename, $output_filestem);
	226	if ($success) {
	227	return "html";
	228	}
	229	}
	230
	231	# Attempt conversion to TEXT
	232	if (!$output_type \|\| ($output_type =~ /text/i)) {
	233	$success = &pdf_to_text($input_filename, $output_filestem);
	234	if ($success) {
	235	return "text";
	236	}
	237	}
	238
	239	return "fail";
	240
	241	}
	242
	243
	244	# Convert an Adobe PostScript document
	245
	246	sub convertPS {
	247	($input_filename, $output_filestem, $output_type) = @_;
	248
	249	my $success = 0;
	250
	251	# Attempt conversion to TEXT
	252	if (!$output_type \|\| ($output_type =~ /text/i)) {
	253	$success = &ps_to_text($input_filename, $output_filestem);
	254	if ($success) {
	255	return "text";
	256	}
	257	}
	258
	259	return "fail";
	260
	261	}
	262
	263
[1654]	264	# Find the real type of a .doc file
	265	#
	266	# We seem to have a lot of files with a .doc extension that are .rtf
	267	# files or Word 5 files. This function attempts to tell the difference.
[1445]	268
# Argument: name of the file to inspect.
# Returns:  "rtf" when the first line starts with "{\rtf",
#           "word678" when any line contains "Word.Document.6", ".7" or
#           ".8", and "unknown" otherwise (including when the file
#           cannot be opened).
sub find_docfile_type {
    my ($input_filename) = @_;

    # An unreadable file simply cannot be identified
    open(my $chk, '<', $input_filename) or return "unknown";
    binmode($chk);    # the file may well be binary

    my $type = "unknown";
    my $first = 1;

    while (my $line = <$chk>) {
        # check to see if this is an rtf file (signature on line one only)
        if ($first && $line =~ /^\{\\rtf/) {
            $type = "rtf";
            last;
        }

        # is this a word 6/7/8 document?
        if ($line =~ /Word\.Document\.[678]/) {
            $type = "word678";
            last;
        }

        $first = 0;
    }

    # single exit point so the handle is closed on every path
    close($chk);
    return $type;
}
	300
	301
	302
[1445]	303	# Specific type-to-type conversions
	304	#
	305	# Each of the following functions attempts to convert a document from
	306	# a specific format to another. If they succeed they return 1 and leave
	307	# the output document(s) in the appropriate place; if they fail they
	308	# return 0 and delete any working files.
	309
	310
	311	# Attempt to convert a word document to html with the wv program
	312
	313	sub doc_to_html {
	314	($input_filename, $output_filestem) = @_;
	315
	316	# formulate the command
[1578]	317	my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
	318	my $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
	319	my $wvWare = &util::filename_cat($wv_home, "bin", "wvWare");
	320	return 0 unless (-e "$wvWare");
[1654]	321	$cmd = "ulimit -t 20;";
	322	$cmd .= "$wvWare --charset utf-8 --config $wv_conf";
[1578]	323	$cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
[1654]	324
[1445]	325	# execute the command
	326	if (system($cmd)>0)
	327	{
[1578]	328	print STDERR "Error executing wv converter: $!. Continuing...\n";
[1445]	329	}
[1578]	330
[1445]	331	# Was the conversion successful?
	332	if (-e "$output_filestem.html") {
	333	open(TMP, "$output_filestem.html");
	334	$line = <TMP>;
	335	close(TMP);
[1578]	336	if ($line && $line =~ /DOCTYPE HTML/) {
[1445]	337	&util::rm("$output_filestem.err");
	338	return 1;
	339	} else {
	340	# An error of some sort occurred
	341	&util::rm("$output_filestem.html");
	342	&util::rm("$output_filestem.err");
	343	}
	344	}
	345	return 0;
	346	}
	347
	348
[1654]	349	# Attempt to convert an RTF document to html with rtftohtml
	350	#
	351	# rtf2html isn't distributed with Greenstone because it is not
	352	# distributed under teh GPL. If you know of a better solution,
	353	# please let me know.
	354
	355	sub rtf_to_html {
	356	($input_filename, $output_filestem) = @_;
	357
	358	# formulate the command
	359	my $r_cmd = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix",
	360	"rtf2html", "rtf2html", "rtf2html");
	361	$r_cmd = "rtf2html" unless (-e "$r_cmd");
	362	return 0 unless (-e "$r_cmd");
	363	$cmd = "ulimit -t 20;";
	364	$cmd .= "$r_cmd";
	365	$cmd .= " \"$input_filename\" > \"$output_filestem.html\" 2>\"$output_filestem.err\"";
	366
	367	# execute the command
	368	if (system($cmd)>0)
	369	{
	370	print STDERR "Error executing rtf converter: $!. Continuing...\n";
	371	}
	372
	373	# Was the conversion successful?
	374	if (-e "$output_filestem.html") {
	375	open(TMP, "$output_filestem.html");
	376	$line = <TMP>;
	377	close(TMP);
	378	if ($line && $line =~ /DOCTYPE HTML/) {
	379	&util::rm("$output_filestem.err");
	380	return 1;
	381	} else {
	382	# An error of some sort occurred
	383	&util::rm("$output_filestem.html");
	384	&util::rm("$output_filestem.err");
	385	}
	386	}
	387	return 0;
	388	}
	389
	390
[1445]	391	# Convert a pdf file to html with the pdftohtml command
	392
	393	sub pdf_to_html {
	394	($dirname, $input_filename, $output_filestem) = @_;
	395
	396	$cmd = "pdftohtml -F -d $dirname -o \"$output_filestem.html\" \"$input_filename\"";
	397	$cmd .= " > $output_filestem.out";
	398
	399	if (system($cmd)>0)
	400	{
	401	print STDERR "Error executing $cmd: $!\n";
	402	&util::rm("$output_filestem.html") if (-e "$output_filestem.html");
	403	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	404	return 0;
	405	}
	406
	407	&util::rm("$output_filestem.out") if (-e "$output_filestem.out");
	408	return 1;
	409	}
	410
	411
	412	# Convert a PDF file to text with the pdftotext command
	413
	414	sub pdf_to_text {
	415	($dirname, $input_filename, $output_filestem) = @_;
	416
	417	$cmd = "pdftotext \"$input_filename\" > \"$output_filestem.text\"";
	418	$cmd .= " 2> $output_filestem.err";
	419
	420	if (system($cmd)>0)
	421	{
	422	print STDERR "Error executing $cmd: $!\n";
	423	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	424	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	425	return 0;
	426	}
	427
	428	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	429	return 1;
	430	}
	431
	432
	433	# Convert a PostScript document to text with ps2ascii
	434
	435	sub ps_to_text {
	436	($input_filename, $output_filestem) = @_;
	437
	438	my $cmd = "ps2ascii \"$input_filename\" > \"$output_filestem.text\"";
	439	$cmd .= " 2> $output_filestem.err";
	440
	441	if (system($cmd)>0)
	442	{
	443	print STDERR "Error executing $cmd: $!\n";
	444	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	445	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	446	return 0;
	447	}
	448
	449	&util::rm("$output_filestem.err") if (-e "$output_filestem.err");
	450	return 1;
	451	}
	452
	453
	454	# Convert any file to HTML with a crude perl implementation of the
	455	# UNIX strings command.
	456
	457	sub any_to_html {
	458	($input_filename, $output_filestem) = @_;
	459
	460	# First generate a text file
	461	return 0 unless (&any_to_text($input_filename, $output_filestem));
	462
	463	# create an HTML file from the text file
	464	open(TEXT, "<$output_filestem.text");
	465	open(HTML, ">$output_filestem.html");
	466
	467	print HTML '<html><head>
	468	<META HTTP-EQUIV="Content-Type" CONTENT="text/html">
[1578]	469	<META NAME="GENERATOR" CONTENT="Greenstone any_to_html">
[1445]	470	</head><body>\n\n';
	471	while (<TEXT>) {
	472	print HTML "<p> ", $_;
	473
	474	}
	475	print HTML "\n</body></html>]\n";
	476
	477	&util::rm("$output_filestem.text") if (-e "$output_filestem.text");
	478	return 1;
	479	}
	480
	481	# Convert any file to TEXT with a crude perl implementation of the
	482	# UNIX strings command.
	483
	484	sub any_to_text {
	485	($input_filename, $output_filestem) = @_;
	486
	487	open(IN, "<$input_filename");
	488	open(OUT, ">$output_filestem.text");
	489
	490	my ($line);
	491	while (<IN>) {
	492	$line = $_;
[1578]	493
[1445]	494	# delete anything that isn't a printable character
	495	$line =~ s/[^\040-\176]+/\n/sg;
	496
	497	# delete any string less than 10 characters long
[1578]	498	$line =~ s/^[^\n]{0,9}$/\n/mg;
[1445]	499	while ($line =~ /^[^\n]{1,9}$/m) {
	500	$line =~ s/^[^\n]{0,9}$/\n/mg;
	501	$line =~ s/\n+/\n/sg;
	502	}
	503
	504	# remove extraneous whitespace
	505	$line =~ s/\n+/\n/gs;
	506	$line =~ s/^\n//gs;
[1578]	507
[1445]	508	# output whatever is left
[1578]	509	if ($line =~ /[^\n ]/) {
[1445]	510	print OUT $line;
	511	}
	512	}
	513	return 1;
	514	}
	515
	516
	517

Note: See TracBrowser for help on using the repository browser.

Download in other formats: