Context Navigation

source: trunk/gsdl/bin/script/gsConvert.pl@ 1587

Last change on this file since 1587 was 1578, checked in by paynter, 24 years ago
Uses wv version 0.6.0-gs
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 9.9 KB

Line
1	#!/usr/bin/perl -w
2
3	###########################################################################
4	#
5	# gsConvert.pl -- convert documents to HTML or TEXT format
6	#
7	# A component of the Greenstone digital library software
8	# from the New Zealand Digital Library Project at the
9	# University of Waikato, New Zealand.
10	#
11	# Copyright (C) 1999 New Zealand Digital Library Project
12	#
13	# This program is free software; you can redistribute it and/or modify
14	# it under the terms of the GNU General Public License as published by
15	# the Free Software Foundation; either version 2 of the License, or
16	# (at your option) any later version.
17	#
18	# This program is distributed in the hope that it will be useful,
19	# but WITHOUT ANY WARRANTY; without even the implied warranty of
20	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	# GNU General Public License for more details.
22	#
23	# You should have received a copy of the GNU General Public License
24	# along with this program; if not, write to the Free Software
25	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26	#
27	###########################################################################
28
29	# gsConvert.pl converts documents in a range of formats to HTML or TEXT
30	# by exploiting third-party programs. These are usually found in the
31	# $GSDLHOME/packages directory.
32	#
33	# Currently, we can convert Microsoft Word and Adobe PDF using specialised
34	# conversion utilities. We can convert any file to text with a perl
35	# implementation of the UNIX strings command.
36
37
BEGIN {
    # The script cannot run without GSDLHOME: its perllib directory is
    # put at the front of @INC before the "use" lines that follow are
    # compiled (presumably that is where parsargv and util are found).
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
42
43	use parsargv;
44	use util;
45	use Cwd;
46	use File::Basename;
47
48
# Print a one-line usage summary on STDERR and terminate the script
# with exit status 1. Never returns.
sub print_usage
{
    my $usage = "Usage: $0 [-type doc|pdf] [-output html|text] filename\n";
    print STDERR $usage;
    exit(1);
}
54
55
# Entry point: parse the command line, work out the input type, run the
# matching converter and print the converter's result ("html", "text" or
# "fail") followed by a newline on STDOUT.
#
# Arguments: the command-line argument list (options, then one filename).
#
# Prints a message on STDERR and exits with status 1 when the arguments
# are malformed, the input file is unreadable, or the input type is
# missing or unsupported.
sub main
{
    my @args = @_;
    my ($input_type, $output_type, $verbose);

    # read command-line arguments
    # NOTE(review): the -type pattern lists only doc and pdf, so PostScript
    # appears to be reachable only through a .ps filename extension.
    if (!parsargv::parse(\@args,
                         'type/(doc|pdf)/', \$input_type,
                         'output/(html|text)/', \$output_type,
                         'verbose/\d+/0', \$verbose))
    {
        print_usage();
    }

    # Exactly one argument (the input filename) must be left over
    if (scalar(@args) != 1) {
        print_usage();
    }

    # Make sure the input file exists and can be opened for reading
    my $input_filename = $args[0];
    if (!-r $input_filename) {
        print STDERR "Error: unable to open $input_filename for reading\n";
        exit(1);
    }

    # We chdir into the file's directory below, so a relative path such as
    # "docs/a.pdf" would stop resolving. Make it absolute first; rel2abs
    # does not resolve symlinks, so output still lands beside the given path.
    require File::Spec;
    $input_filename = File::Spec->rel2abs($input_filename);

    # Deduce filenames
    my ($tailname, $dirname, $suffix)
        = File::Basename::fileparse($input_filename, '\..+');
    my $output_filestem = &util::filename_cat($dirname, "$tailname");

    # No explicit -type: fall back to the filename extension (without the
    # leading dot, lower-cased so that ".DOC" and ".doc" are treated alike).
    if (!defined $input_type || $input_type eq "") {
        if (defined $suffix && $suffix ne "") {
            $input_type = lc(substr($suffix, 1));
        }
        else {
            $input_type = undef;
        }
    }

    # Change to temporary working directory
    my $stored_dir = cwd();
    chdir($dirname) || die "Unable to change to directory $dirname";

    # Select convert utility
    if (!defined $input_type) {
        print STDERR "Error: No filename extension or input type defined\n";
        exit(1);
    }
    elsif ($input_type eq "doc") {
        print convertDOC($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "pdf") {
        print convertPDF($dirname, $input_filename, $output_filestem, $output_type);
        print "\n";
    }
    elsif ($input_type eq "ps") {
        print convertPS($input_filename, $output_filestem, $output_type);
        print "\n";
    }
    else {
        print STDERR "Error: Unable to convert type '$input_type'\n";
        exit(1);
    }

    # restore to original working directory
    chdir($stored_dir) || die "Unable to return to directory $stored_dir";

}
120
# Run the converter on the script's command-line arguments.
main(@ARGV);
122
123
124
125	# Document-type conversion functions
126	#
127	# The following functions attempt to convert documents from their
128	# input type to the specified output type. If no output type was
129	# given, then they first attempt HTML, and then TEXT.
130	#
131	# Each returns the output type ("html" or "text") or "fail" if no
132	# conversion is possible.
133
# Convert a Microsoft Word document.
#
# Arguments:
#   $input_filename  - path of the document to convert
#   $output_filestem - output path without its extension
#   $output_type     - "html" or "text"; when false, HTML is tried first
#                      and TEXT second
#
# Returns "html" or "text" for the format that was produced, or "fail"
# when no conversion succeeded.
sub convertDOC {
    # Lexical copies: the arguments must not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    if (!$output_type || ($output_type =~ /html/i)) {
        # Attempt specialised conversion to HTML
        return "html" if doc_to_html($input_filename, $output_filestem);

        # Attempt simple conversion to HTML
        return "html" if any_to_html($input_filename, $output_filestem);
    }

    # Convert to text
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if any_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
168
169
# Convert an Adobe PDF document.
#
# Arguments:
#   $dirname         - directory handed on to the PDF converters
#   $input_filename  - path of the document to convert
#   $output_filestem - output path without its extension
#   $output_type     - "html" or "text"; when false, HTML is tried first
#                      and TEXT second
#
# Returns "html" or "text" for the format that was produced, or "fail"
# when no conversion succeeded.
sub convertPDF {
    # Lexical copies: the arguments must not leak into package globals.
    my ($dirname, $input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to HTML
    if (!$output_type || ($output_type =~ /html/i)) {
        return "html"
            if pdf_to_html($dirname, $input_filename, $output_filestem);
    }

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        # pdf_to_text unpacks ($dirname, $input_filename, $output_filestem),
        # so the directory has to be passed as well; leaving it out shifts
        # every argument by one position.
        return "text"
            if pdf_to_text($dirname, $input_filename, $output_filestem);
    }

    return "fail";
}
196
197
# Convert an Adobe PostScript document.
#
# Arguments:
#   $input_filename  - path of the document to convert
#   $output_filestem - output path without its extension
#   $output_type     - "text", or false for "whatever is available"
#
# Only a TEXT conversion exists for PostScript, so a request for any other
# output type yields "fail". Returns "text" on success, "fail" otherwise.
sub convertPS {
    # Lexical copies: the arguments must not leak into package globals.
    my ($input_filename, $output_filestem, $output_type) = @_;

    # Attempt conversion to TEXT
    if (!$output_type || ($output_type =~ /text/i)) {
        return "text" if ps_to_text($input_filename, $output_filestem);
    }

    return "fail";
}
216
217
218
219	# Specific type-to-type conversions
220	#
221	# Each of the following functions attempts to convert a document from
222	# a specific format to another. If they succeed they return 1 and leave
223	# the output document(s) in the appropriate place; if they fail they
224	# return 0 and delete any working files.
225
226
# Attempt to convert a word document to html with the wv program.
#
# Arguments:
#   $input_filename  - path of the Word document
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# Returns 1 when an HTML file whose first line contains "DOCTYPE HTML" was
# produced, and 0 otherwise (including when wvWare is not installed under
# $GSDLHOME/packages/unix/wv). Working files are removed on failure.
sub doc_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $err_file  = "$output_filestem.err";

    # formulate the command
    my $wv_home = &util::filename_cat($ENV{'GSDLHOME'}, "packages", "unix", "wv");
    my $wv_conf = &util::filename_cat($wv_home, "lib", "wv", "wvHtml.xml");
    my $wvWare  = &util::filename_cat($wv_home, "bin", "wvWare");
    return 0 unless (-e "$wvWare");

    # Every path is quoted so that directories containing spaces survive
    # the trip through the shell.
    my $cmd = "\"$wvWare\" --charset utf-8 --config \"$wv_conf\"";
    $cmd .= " \"$input_filename\" > \"$html_file\" 2>\"$err_file\"";

    # execute the command; a non-zero status is reported but not fatal,
    # because the output file is inspected below anyway.
    # (system returns -1 when the command could not be started at all.)
    if (system($cmd) != 0)
    {
        print STDERR "Error executing wv converter: $!. Continuing...\n";
    }

    # Was the conversion successful? Look at the first line of the output.
    my $first_line;
    if (-e $html_file) {
        if (open(my $html, '<', $html_file)) {
            $first_line = <$html>;
            close($html);
        }
    }
    my $converted = ($first_line && $first_line =~ /DOCTYPE HTML/) ? 1 : 0;

    # The error log is never kept; the HTML only when it looks valid.
    &util::rm($err_file) if (-e $err_file);
    if (!$converted) {
        &util::rm($html_file) if (-e $html_file);
    }

    return $converted;
}
262
263
# Convert a pdf file to html with the pdftohtml command.
#
# Arguments:
#   $dirname         - directory passed to pdftohtml's -d option
#   $input_filename  - path of the PDF document
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# Returns 1 when pdftohtml exits successfully, 0 otherwise. The captured
# standard output ("$output_filestem.out") is always removed; the HTML
# file is removed on failure.
sub pdf_to_html {
    my ($dirname, $input_filename, $output_filestem) = @_;

    my $html_file = "$output_filestem.html";
    my $out_file  = "$output_filestem.out";

    # All paths are quoted so that names containing spaces reach pdftohtml
    # (and the redirect) as a single word.
    my $cmd = "pdftohtml -F -d \"$dirname\" -o \"$html_file\" \"$input_filename\"";
    $cmd .= " > \"$out_file\"";

    # system returns -1 when the command could not be started, so any
    # non-zero value counts as failure.
    my $failed = (system($cmd) != 0);
    if ($failed)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($html_file) if (-e $html_file);
    }

    &util::rm($out_file) if (-e $out_file);
    return $failed ? 0 : 1;
}
283
284
# Convert a PDF file to text with the pdftotext command.
#
# Arguments:
#   ($dirname, $input_filename, $output_filestem), or just
#   ($input_filename, $output_filestem) -- the directory is not used, and
#   this file contains a caller that omits it, so both forms are accepted.
#
# Returns 1 when pdftotext exits successfully and leaves the text in
# "$output_filestem.text"; returns 0 and removes the working files
# otherwise.
sub pdf_to_text {
    my @args = @_;
    shift(@args) if (scalar(@args) > 2);    # drop the unused $dirname
    my ($input_filename, $output_filestem) = @args;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    # pdftotext writes to a file, not to standard output, so the output
    # file is given as its second argument instead of via a redirect.
    my $cmd = "pdftotext \"$input_filename\" \"$text_file\"";
    $cmd .= " 2> \"$err_file\"";

    # system returns -1 when the command could not be started, so any
    # non-zero value counts as failure.
    my $failed = (system($cmd) != 0);
    if ($failed)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    &util::rm($err_file) if (-e $err_file);
    return $failed ? 0 : 1;
}
304
305
# Convert a PostScript document to text with ps2ascii.
#
# Arguments:
#   $input_filename  - path of the PostScript document
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Returns 1 when ps2ascii exits successfully, 0 otherwise. The captured
# error output ("$output_filestem.err") is always removed; the text file
# is removed on failure.
sub ps_to_text {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $err_file  = "$output_filestem.err";

    # Both redirect targets are quoted so that names containing spaces
    # are handled the same way as the input file.
    my $cmd = "ps2ascii \"$input_filename\" > \"$text_file\"";
    $cmd .= " 2> \"$err_file\"";

    # system returns -1 when the command could not be started, so any
    # non-zero value counts as failure.
    my $failed = (system($cmd) != 0);
    if ($failed)
    {
        print STDERR "Error executing $cmd: $!\n";
        &util::rm($text_file) if (-e $text_file);
    }

    &util::rm($err_file) if (-e $err_file);
    return $failed ? 0 : 1;
}
325
326
# Convert any file to HTML with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to convert
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.html"
#
# The text is first extracted with any_to_text, then each extracted line
# is wrapped as an HTML paragraph. The intermediate ".text" file is
# removed. Returns 1 on success and 0 when the text extraction fails or a
# file cannot be opened or written.
sub any_to_html {
    my ($input_filename, $output_filestem) = @_;

    my $text_file = "$output_filestem.text";
    my $html_file = "$output_filestem.html";

    # First generate a text file
    return 0 unless (any_to_text($input_filename, $output_filestem));

    # create an HTML file from the text file
    my ($text, $html);
    if (!open($text, '<', $text_file)) {
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }
    if (!open($html, '>', $html_file)) {
        close($text);
        &util::rm($text_file) if (-e $text_file);
        return 0;
    }

    # Double-quoted pieces, so that \n really becomes a newline.
    print $html "<html><head>\n",
                "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html\">\n",
                "<META NAME=\"GENERATOR\" CONTENT=\"Greenstone any_to_html\">\n",
                "</head><body>\n\n";

    while (my $line = <$text>) {
        # The extracted text may contain markup characters; escape them so
        # they are shown literally instead of being read as tags/entities.
        $line =~ s/&/&amp;/g;
        $line =~ s/</&lt;/g;
        $line =~ s/>/&gt;/g;
        print $html "<p> ", $line;
    }
    print $html "\n</body></html>\n";

    # Close both handles before deleting the text file; closing the HTML
    # handle also surfaces any buffered write error.
    close($text);
    my $written = close($html);

    &util::rm($text_file) if (-e $text_file);
    if (!$written) {
        &util::rm($html_file) if (-e $html_file);
        return 0;
    }
    return 1;
}
353
# Convert any file to TEXT with a crude perl implementation of the
# UNIX strings command.
#
# Arguments:
#   $input_filename  - path of the file to read (treated as binary)
#   $output_filestem - output path without extension; the result is written
#                      to "$output_filestem.text"
#
# Runs of printable ASCII characters (octal 040-176) that are at least ten
# characters long are kept, one per line; everything else is dropped.
# Returns 1 once the output has been written and closed, and 0 when either
# file cannot be opened or the output cannot be flushed.
sub any_to_text {
    my ($input_filename, $output_filestem) = @_;

    open(my $in, '<', $input_filename) or return 0;
    binmode($in);    # arbitrary binary data: no newline translation

    my $out;
    if (!open($out, '>', "$output_filestem.text")) {
        close($in);
        return 0;
    }

    while (my $line = <$in>) {

        # delete anything that isn't a printable character
        $line =~ s/[^\040-\176]+/\n/sg;

        # delete any string less than 10 characters long
        $line =~ s/^[^\n]{0,9}$/\n/mg;
        while ($line =~ /^[^\n]{1,9}$/m) {
            $line =~ s/^[^\n]{0,9}$/\n/mg;
            $line =~ s/\n+/\n/sg;
        }

        # remove extraneous whitespace
        $line =~ s/\n+/\n/gs;
        $line =~ s/^\n//gs;

        # output whatever is left
        if ($line =~ /[^\n ]/) {
            print $out $line;
        }
    }

    # Close explicitly so the text is on disk before the caller reads it.
    close($in);
    close($out) or return 0;
    return 1;
}
388
389
390

Note: See TracBrowser for help on using the repository browser.

Download in other formats: